diff --git a/.gitignore b/.gitignore index 7422cead3..2c298e3b4 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ lapack-netlib/make.inc lapack-netlib/lapacke/include/lapacke_mangling.h lapack-netlib/TESTING/testing_results.txt *.so +*.so.* *.a .svn *~ @@ -65,3 +66,5 @@ test/sblat3 test/zblat1 test/zblat2 test/zblat3 +build +build.* diff --git a/.travis.yml b/.travis.yml index 7d625c9dc..806cb0046 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,13 @@ language: c + +notifications: + webhooks: + urls: + - https://webhooks.gitter.im/e/8a6e4470a0cebd090344 + on_success: change # options: [always|never|change] default: always + on_failure: always # options: [always|never|change] default: always + on_start: never # options: [always|never|change] default: always + compiler: - gcc diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..3b436dc13 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,190 @@ +## +## Author: Hank Anderson +## + +cmake_minimum_required(VERSION 2.8.4) +project(OpenBLAS) +set(OpenBLAS_MAJOR_VERSION 0) +set(OpenBLAS_MINOR_VERSION 2) +set(OpenBLAS_PATCH_VERSION 14) +set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") + +enable_language(ASM) +enable_language(C) + +if(MSVC) +set(OpenBLAS_LIBNAME libopenblas) +else() +set(OpenBLAS_LIBNAME openblas) +endif() + +####### +if(MSVC) +option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) +endif() +option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) +option(BUILD_DEBUG "Build Debug Version" OFF) +####### +if(BUILD_WITHOUT_LAPACK) +set(NO_LAPACK 1) +set(NO_LAPACKE 1) +endif() + +if(BUILD_DEBUG) +set(CMAKE_BUILD_TYPE Debug) +else() +set(CMAKE_BUILD_TYPE Release) +endif() + +if(BUILD_WITHOUT_CBLAS) +set(NO_CBLAS 1) +endif() + +####### + + +message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. 
Only x86 support is currently available.") + +include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") +include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") + +set(BLASDIRS interface driver/level2 driver/level3 driver/others) + +if (NOT DYNAMIC_ARCH) + list(APPEND BLASDIRS kernel) +endif () + +if (DEFINED UTEST_CHECK) + set(SANITY_CHECK 1) +endif () + +if (DEFINED SANITY_CHECK) + list(APPEND BLASDIRS reference) +endif () + +set(SUBDIRS ${BLASDIRS}) +if (NOT NO_LAPACK) + list(APPEND SUBDIRS lapack) +endif () + +# set which float types we want to build for +if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) + # if none are defined, build for all + set(BUILD_SINGLE true) + set(BUILD_DOUBLE true) + set(BUILD_COMPLEX true) + set(BUILD_COMPLEX16 true) +endif () + +set(FLOAT_TYPES "") +if (BUILD_SINGLE) + message(STATUS "Building Single Precision") + list(APPEND FLOAT_TYPES "SINGLE") # defines nothing +endif () + +if (BUILD_DOUBLE) + message(STATUS "Building Double Precision") + list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE +endif () + +if (BUILD_COMPLEX) + message(STATUS "Building Complex Precision") + list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX +endif () + +if (BUILD_COMPLEX16) + message(STATUS "Building Double Complex Precision") + list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE +endif () + +set(SUBDIRS_ALL ${SUBDIRS} test ctest utest exports benchmark ../laswp ../bench) + +# all :: libs netlib tests shared + +# libs : +if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") + message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.") +endif () + +if (${NO_STATIC} AND ${NO_SHARED}) + message(FATAL_ERROR "Neither static nor shared are enabled.") +endif () + +# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) +set(TARGET_OBJS "") +foreach (SUBDIR ${SUBDIRS}) + add_subdirectory(${SUBDIR}) + string(REPLACE "/" "_" subdir_obj ${SUBDIR}) + list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:${subdir_obj}>") +endforeach () + +# netlib: + +# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. +# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
+if (NOT NOFORTRAN AND NOT NO_LAPACK) + include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") +if (NOT NO_LAPACKE) + include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake") +endif () +endif () + +#Only generate .def for dll on MSVC +if(MSVC) +set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") +endif() + +# add objects to the openblas lib +add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + +include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") + + +if(NOT MSVC) +#only build shared library for MSVC +add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) +set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) +set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) + +if(SMP) +target_link_libraries(${OpenBLAS_LIBNAME} pthread) +target_link_libraries(${OpenBLAS_LIBNAME}_static pthread) +endif() + +#build test and ctest +enable_testing() +add_subdirectory(test) +if(NOT NO_CBLAS) +add_subdirectory(ctest) +endif() +endif() + +set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES + VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} + SOVERSION ${OpenBLAS_MAJOR_VERSION} +) + + +# TODO: Why is the config saved here? Is this necessary with CMake? +#Save the config files for installation +# @cp Makefile.conf Makefile.conf_last +# @cp config.h config_last.h +#ifdef QUAD_PRECISION +# @echo "#define QUAD_PRECISION">> config_last.h +#endif +#ifeq ($(EXPRECISION), 1) +# @echo "#define EXPRECISION">> config_last.h +#endif +### +#ifeq ($(DYNAMIC_ARCH), 1) +# @$(MAKE) -C kernel commonlibs || exit 1 +# @for d in $(DYNAMIC_CORE) ; \ +# do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ +# done +# @echo DYNAMIC_ARCH=1 >> Makefile.conf_last +#endif +#ifdef USE_THREAD +# @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last +#endif +# @touch lib.grd + diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b88e3671b..88e461dc4 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -127,5 +127,8 @@ In chronological order: * Ton van den Heuvel * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). +* Martin Koehler + * [2015-09-07] Improved imatcopy + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/Changelog.txt b/Changelog.txt index 6941a9f96..422b8b519 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,57 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.15 +27-Oct-2015 +common: + * Support cmake on x86/x86-64. Natively compiling on MS Visual Studio. + (experimental. Thank Hank Anderson for the initial cmake porting work.) + + On Linux and Mac OSX, OpenBLAS cmake supports assembly kernels. + e.g. cmake . + make + make test (Optional) + + On Windows MS Visual Studio, OpenBLAS cmake only support C kernels. + (OpenBLAS uses AT&T style assembly, which is not supported by MSVC.) + e.g. cmake -G "Visual Studio 12 Win64" . + Open OpenBLAS.sln and build. + + * Enable MAX_STACK_ALLOC flags by default. + Improve ger and gemv for small matrices. + * Improve gemv parallel with small m and large n case. + * Improve ?imatcopy when lda==ldb (#633. Thanks, Martin Koehler) + * Add vecLib benchmarks (#565. Thanks, Andreas Noack.) + * Fix LAPACK lantr for row major matrices (#634. Thanks, Dan Kortschak) + * Fix LAPACKE lansy (#640. 
Thanks, Dan Kortschak) + * Import bug fixes for LAPACKE s/dormlq, c/zunmlq + * Raise the signal when pthread_create fails (#668. Thanks, James K. Lowden) + * Remove g77 from compiler list. + * Enable AppVeyor Windows CI. + +x86/x86-64: + * Support pure C generic kernels for x86/x86-64. + * Support Intel Boardwell and Skylake by Haswell kernels. + * Support AMD Excavator by Steamroller kernels. + * Optimize s/d/c/zdot for Intel SandyBridge and Haswell. + * Optimize s/d/c/zdot for AMD Piledriver and Steamroller. + * Optimize s/d/c/zapxy for Intel SandyBridge and Haswell. + * Optimize s/d/c/zapxy for AMD Piledriver and Steamroller. + * Optimize d/c/zscal for Intel Haswell, dscal for Intel SandyBridge. + * Optimize d/c/zscal for AMD Bulldozer, Piledriver and Steamroller. + * Optimize s/dger for Intel SandyBridge. + * Optimize s/dsymv for Intel SandyBridge. + * Optimize ssymv for Intel Haswell. + * Optimize dgemv for Intel Nehalem and Haswell. + * Optimize dtrmm for Intel Haswell. + +ARM: + * Support Android NDK armeabi-v7a-hard ABI (-mfloat-abi=hard) + e.g. make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7 + * Fix lock, rpcc bugs (#616, #617. Thanks, Grazvydas Ignotas) +POWER: + * Support ppc64le platform (ELF ABI v2. #612. Thanks, Matthew Brandyberry.) + * Support POWER7/8 by POWER6 kernels. (#612. Thanks, Fábio Perez.) + ==================================================================== Version 0.2.14 24-Mar-2015 diff --git a/Makefile b/Makefile index 3aaf092fc..6ad87d802 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,8 @@ ifneq ($(NO_LAPACK), 1) SUBDIRS += lapack endif +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) + SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench .PHONY : all libs netlib test ctest shared install @@ -131,7 +133,7 @@ ifeq ($(CORE), UNKOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) endif ifeq ($(NOFORTRAN), 1) - $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) + $(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.) 
endif ifeq ($(NO_STATIC), 1) ifeq ($(NO_SHARED), 1) @@ -231,7 +233,7 @@ ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.arm b/Makefile.arm index 9978a672a..272220ca9 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,13 +1,23 @@ # ifeq logical or ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +ifeq ($(OSNAME), Android) +CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +else CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a endif +endif ifeq ($(CORE), ARMV7) +ifeq ($(OSNAME), Android) +CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +else CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a endif +endif ifeq ($(CORE), ARMV6) CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 @@ -16,8 +26,8 @@ endif ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 -FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +CCOMMON_OPT += -marm -march=armv5 +FCOMMON_OPT += -marm -march=armv5 endif diff --git a/Makefile.install b/Makefile.install index e1deaae3e..9814302b0 100644 --- a/Makefile.install +++ b/Makefile.install @@ -11,6 +11,7 @@ OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake +OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake .PHONY : install .NOTPARALLEL : install @@ -86,8 +87,8 @@ ifeq ($(OSNAME), Darwin) ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) - @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) - @-cp $(LIBDLLNAME).a $(OPENBLAS_LIBRARY_DIR) + @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) endif ifeq ($(OSNAME), CYGWIN_NT) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) @@ -97,6 +98,7 @@ endif @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) @@ -112,5 +114,16 @@ else #only static @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) endif +#Generating OpenBLASConfigVersion.cmake + @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> 
$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo Install OK! diff --git a/Makefile.rule b/Makefile.rule index 1479de660..459f79c26 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.14 +VERSION = 0.2.15 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -162,13 +162,16 @@ COMMON_PROF = -pg # Improve GEMV and GER for small matrices by stack allocation. # For details, https://github.com/xianyi/OpenBLAS/pull/482 # -# MAX_STACK_ALLOC=2048 + MAX_STACK_ALLOC=2048 # Add a prefix or suffix to all exported symbol names in the shared library. # Avoid conflicts with other BLAS libraries, especially when using # 64 bit integer interfaces in OpenBLAS. # For details, https://github.com/xianyi/OpenBLAS/pull/459 # +# The same prefix and suffix are also added to the library name, +# i.e. you get lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) rather than libopenblas +# # SYMBOLPREFIX= # SYMBOLSUFFIX= diff --git a/Makefile.system b/Makefile.system index 525daa41b..42ad49849 100644 --- a/Makefile.system +++ b/Makefile.system @@ -23,6 +23,7 @@ CC = gcc UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) CC = clang +# EXTRALIB += -Wl,-no_compact_unwind endif endif @@ -64,6 +65,9 @@ endif ifeq ($(TARGET), STEAMROLLER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), EXCAVATOR) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -91,6 +95,9 @@ endif ifeq ($(TARGET_CORE), STEAMROLLER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET_CORE), EXCAVATOR) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -195,12 +202,18 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv + +# For detect fortran failed, only build BLAS. +ifeq ($(NOFORTRAN), 1) +NO_LAPACK = 1 +endif + # # OS dependent settings # ifeq ($(OSNAME), Darwin) -export MACOSX_DEPLOYMENT_TARGET=10.2 +export MACOSX_DEPLOYMENT_TARGET=10.6 MD5SUM = md5 -r endif @@ -323,6 +336,11 @@ ifeq ($(ARCH), x86) ifndef BINARY NO_BINARY_MODE = 1 endif + +ifeq ($(CORE), generic) +NO_EXPRECISION = 1 +endif + ifndef NO_EXPRECISION ifeq ($(F_COMPILER), GFORTRAN) # ifeq logical or. GCC or LSB @@ -341,6 +359,11 @@ endif endif ifeq ($(ARCH), x86_64) + +ifeq ($(CORE), generic) +NO_EXPRECISION = 1 +endif + ifndef NO_EXPRECISION ifeq ($(F_COMPILER), GFORTRAN) # ifeq logical or. 
GCC or LSB @@ -408,7 +431,7 @@ endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) DYNAMIC_CORE += HASWELL @@ -578,7 +601,7 @@ else FCOMMON_OPT += -m32 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif endif @@ -590,14 +613,14 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif ifeq ($(F_COMPILER), FUJITSU) CCOMMON_OPT += -DF_INTERFACE_FUJITSU -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif @@ -615,7 +638,7 @@ endif else FCOMMON_OPT += -q32 endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif @@ -633,7 +656,7 @@ FCOMMON_OPT += -tp p7-64 else FCOMMON_OPT += -tp p7 endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif endif @@ -662,7 +685,7 @@ FCOMMON_OPT += -mabi=n32 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif endif @@ -699,7 +722,7 @@ FCOMMON_OPT += -m64 endif endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FEXTRALIB += -lstdc++ FCOMMON_OPT += -mp endif @@ -747,14 +770,14 @@ FCOMMON_OPT += -m32 else FCOMMON_OPT += -m64 endif -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -xopenmp=parallel endif endif ifeq ($(F_COMPILER), COMPAQ) CCOMMON_OPT += -DF_INTERFACE_COMPAQ -ifdef USE_OPENMP +ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif @@ -857,12 +880,6 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifndef LIBNAMESUFFIX -LIBPREFIX = libopenblas -else -LIBPREFIX = libopenblas_$(LIBNAMESUFFIX) -endif - ifndef SYMBOLPREFIX SYMBOLPREFIX = endif @@ -871,6 +888,12 @@ ifndef SYMBOLSUFFIX SYMBOLSUFFIX = endif +ifndef LIBNAMESUFFIX +LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) +else +LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +endif + KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) diff --git a/README.md b/README.md index cdacf9888..0ec86d362 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,10 @@ # OpenBLAS -[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +[![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) + +AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. 
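
The OpenBLASConfig.cmake and OpenBLASConfigVersion.cmake files written by the Makefile.install rules above are intended for CMake's find_package config mode. A minimal consumer sketch follows; it is hypothetical and not part of this patch, and it assumes the chosen install PREFIX is visible via CMAKE_PREFIX_PATH and that demo.c is a placeholder source. OpenBLAS_INCLUDE_DIRS and OpenBLAS_LIBRARIES are the variables the install rule writes into the config file.

# Hypothetical downstream CMakeLists.txt consuming an installed OpenBLAS.
cmake_minimum_required(VERSION 2.8.4)
project(openblas_consumer C)

# Config mode finds <PREFIX>/lib/cmake/openblas/OpenBLASConfig.cmake;
# the version argument is checked by the generated OpenBLASConfigVersion.cmake.
find_package(OpenBLAS 0.2.15 REQUIRED)

# Variables set by the generated OpenBLASConfig.cmake.
include_directories(${OpenBLAS_INCLUDE_DIRS})

add_executable(demo demo.c)
target_link_libraries(demo ${OpenBLAS_LIBRARIES})
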
diff --git a/TargetList.txt b/TargetList.txt index 1c985080b..b2878ba32 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -33,6 +33,7 @@ BOBCAT BULLDOZER PILEDRIVER STEAMROLLER +EXCAVATOR c)VIA CPU: SSE_GENERIC @@ -43,6 +44,8 @@ NANO POWER4 POWER5 POWER6 +POWER7 +POWER8 PPCG4 PPC970 PPC970MP diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 000000000..394e48854 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,42 @@ +version: 0.2.15.{build} + +#environment: + +platform: + - x64 + +configuration: Release + +clone_folder: c:\projects\OpenBLAS + +init: + - git config --global core.autocrlf input + +build: + project: OpenBLAS.sln + +clone_depth: 5 + +#branches to build +branches: + only: + - master + - develop + - cmake + +skip_tags: true + +matrix: + fast_finish: true + +skip_commits: +# Add [av skip] to commit messages + message: /\[av skip\]/ + +before_build: + - echo Running cmake... + - cd c:\projects\OpenBLAS + - cmake -G "Visual Studio 12 Win64" . + +test_script: + - echo Build OK! diff --git a/benchmark/Make_exe.sh b/benchmark/Make_exe.sh new file mode 100755 index 000000000..4304f6fb3 --- /dev/null +++ b/benchmark/Make_exe.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +for f in *.goto *.acml *.mkl *.atlas +do + if [ -f "$f" ]; then + mv $f `echo $f|tr '.' '_'`.exe + fi +done + diff --git a/benchmark/Makefile b/benchmark/Makefile index b5eaa9343..492d2617f 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -10,7 +10,7 @@ include $(TOPDIR)/Makefile.system #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # ACML 6.1 custom -ACML=/home/werner/project/acml6.1/gfortran64_mp/lib +ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm @@ -30,7 +30,10 @@ LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf MKL=/home/saar/intel_mkl LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm +# Apple vecLib +LIBVECLIB = -framework Accelerate +ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ @@ -39,9 +42,13 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto \ + sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ @@ -49,6 +56,114 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml 
zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + +else + +goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto cdot.goto zdot.goto \ + saxpy.goto 
daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto @@ -60,9 +175,13 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml \ + sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ @@ -70,6 +189,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml @@ -81,9 +201,13 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ @@ -92,6 +216,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas @@ -103,9 +228,13 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl \ - sdot.mkl ddot.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ @@ -113,20 +242,56 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl 
zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + + +endif + + + +veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ + scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ + sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ + strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ + strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ + ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ + sger.veclib dger.veclib cger.veclib zger.veclib \ + sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ + sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ + sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ + sasum.veclib dasum.veclib casum.veclib zasum.veclib \ + ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ + chemv.veclib zhemv.veclib \ + chemm.veclib zhemm.veclib \ + cherk.veclib zherk.veclib \ + cher2k.veclib zher2k.veclib \ + sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ + sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ + spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib + goto_3m :: cgemm3m.goto zgemm3m.goto -mkl_3m :: cgemm3m.mkl zgemm3m.mkl +mkl_3m :: cgemm3m.mkl zgemm3m.mkl -all :: goto mkl atlas acml +all :: goto mkl atlas acml veclib + +exe : + @./Make_exe.sh ##################################### Slinpack #################################################### slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm slinpack.acml : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -137,9 +302,12 @@ slinpack.atlas : slinpack.$(SUFFIX) slinpack.mkl : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +slinpack.veclib : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dlinpack #################################################### dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dlinpack.acml : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -150,10 +318,13 @@ dlinpack.atlas : dlinpack.$(SUFFIX) dlinpack.mkl : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dlinpack.veclib : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm clinpack.acml : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -164,10 +335,13 @@ clinpack.atlas : clinpack.$(SUFFIX) clinpack.mkl : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +clinpack.veclib : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zlinpack.acml : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -178,10 +352,13 @@ zlinpack.atlas : zlinpack.$(SUFFIX) zlinpack.mkl : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zlinpack.veclib : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm scholesky.acml : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -192,10 +369,13 @@ scholesky.atlas : scholesky.$(SUFFIX) scholesky.mkl : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +scholesky.veclib : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dcholesky ################################################### dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dcholesky.acml : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -206,10 +386,13 @@ dcholesky.atlas : dcholesky.$(SUFFIX) dcholesky.mkl : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dcholesky.veclib : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ccholesky ################################################### ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ccholesky.acml : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -220,13 +403,14 @@ ccholesky.atlas : ccholesky.$(SUFFIX) ccholesky.mkl : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm +ccholesky.veclib : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zcholesky ################################################### -xcholesky.goto : xcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm zcholesky.acml : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -237,10 +421,12 @@ zcholesky.atlas : zcholesky.$(SUFFIX) zcholesky.mkl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zcholesky.veclib : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgemm.acml : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -251,9 +437,12 @@ sgemm.atlas : sgemm.$(SUFFIX) sgemm.mkl : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgemm.veclib : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgemm #################################################### dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgemm.acml : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -264,10 +453,13 @@ dgemm.atlas : dgemm.$(SUFFIX) dgemm.mkl : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgemm.veclib : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgemm #################################################### cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemm.acml : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -278,10 +470,13 @@ cgemm.atlas : cgemm.$(SUFFIX) cgemm.mkl : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemm.veclib : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemm #################################################### zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemm.acml : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -292,9 +487,12 @@ zgemm.atlas : zgemm.$(SUFFIX) zgemm.mkl : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemm.veclib : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymm #################################################### ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssymm.acml : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -305,9 +503,12 @@ ssymm.atlas : ssymm.$(SUFFIX) ssymm.mkl : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) +ssymm.veclib : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsymm #################################################### dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsymm.acml : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -318,10 +519,13 @@ dsymm.atlas : dsymm.$(SUFFIX) dsymm.mkl : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsymm.veclib : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csymm #################################################### csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csymm.acml : csymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -332,10 +536,13 @@ csymm.atlas : csymm.$(SUFFIX) csymm.mkl : csymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csymm.veclib : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zsymm #################################################### zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsymm.acml : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -346,9 +553,12 @@ zsymm.atlas : zsymm.$(SUFFIX) zsymm.mkl : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsymm.veclib : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Strmm #################################################### strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm strmm.acml : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -359,9 +569,12 @@ strmm.atlas : strmm.$(SUFFIX) strmm.mkl : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strmm.veclib : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrmm #################################################### dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dtrmm.acml : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -372,10 +585,13 @@ dtrmm.atlas : dtrmm.$(SUFFIX) dtrmm.mkl : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrmm.veclib : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrmm #################################################### ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ctrmm.acml : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -386,10 +602,13 @@ ctrmm.atlas : ctrmm.$(SUFFIX) ctrmm.mkl : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrmm.veclib : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrmm #################################################### ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ztrmm.acml : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -400,10 +619,12 @@ ztrmm.atlas : ztrmm.$(SUFFIX) ztrmm.mkl : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrmm.veclib : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Strsm #################################################### strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm strsm.acml : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -414,9 +635,12 @@ strsm.atlas : strsm.$(SUFFIX) strsm.mkl : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strsm.veclib : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrsm #################################################### dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dtrsm.acml : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -427,10 +651,13 @@ dtrsm.atlas : dtrsm.$(SUFFIX) dtrsm.mkl : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrsm.veclib : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrsm #################################################### ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ctrsm.acml : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -441,10 +668,13 @@ ctrsm.atlas : ctrsm.$(SUFFIX) ctrsm.mkl : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrsm.veclib : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrsm #################################################### ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ztrsm.acml : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -455,9 +685,12 @@ ztrsm.atlas : ztrsm.$(SUFFIX) ztrsm.mkl : ztrsm.$(SUFFIX) -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrsm.veclib : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssyrk.acml : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -468,9 +701,12 @@ ssyrk.atlas : ssyrk.$(SUFFIX) ssyrk.mkl : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ssyrk.veclib : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsyrk #################################################### dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsyrk.acml : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -481,10 +717,13 @@ dsyrk.atlas : dsyrk.$(SUFFIX) dsyrk.mkl : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsyrk.veclib : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csyrk #################################################### csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csyrk.acml : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -495,10 +734,13 @@ csyrk.atlas : csyrk.$(SUFFIX) csyrk.mkl : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csyrk.veclib : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zsyrk #################################################### zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsyrk.acml : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -509,10 +751,12 @@ zsyrk.atlas : zsyrk.$(SUFFIX) zsyrk.mkl : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsyrk.veclib : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyr2k #################################################### ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssyr2k.acml : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -523,9 +767,12 @@ ssyr2k.atlas : ssyr2k.$(SUFFIX) ssyr2k.mkl : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ssyr2k.veclib : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsyr2k #################################################### 
dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsyr2k.acml : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -536,10 +783,13 @@ dsyr2k.atlas : dsyr2k.$(SUFFIX) dsyr2k.mkl : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsyr2k.veclib : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csyr2k #################################################### csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csyr2k.acml : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -550,10 +800,13 @@ csyr2k.atlas : csyr2k.$(SUFFIX) csyr2k.mkl : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csyr2k.veclib : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zsyr2k #################################################### zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsyr2k.acml : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -564,10 +817,13 @@ zsyr2k.atlas : zsyr2k.$(SUFFIX) zsyr2k.mkl : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsyr2k.veclib : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Chemm #################################################### chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm chemm.acml : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -578,10 +834,13 @@ chemm.atlas : chemm.$(SUFFIX) chemm.mkl : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +chemm.veclib : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zhemm #################################################### zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zhemm.acml : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -592,10 +851,13 @@ zhemm.atlas : zhemm.$(SUFFIX) zhemm.mkl : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zhemm.veclib : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cherk #################################################### cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cherk.acml : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) @@ -606,10 +868,13 @@ cherk.atlas : cherk.$(SUFFIX) cherk.mkl : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cherk.veclib : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zherk #################################################### zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zherk.acml : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -620,10 +885,13 @@ zherk.atlas : zherk.$(SUFFIX) zherk.mkl : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zherk.veclib : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cher2k #################################################### cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cher2k.acml : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -634,10 +902,13 @@ cher2k.atlas : cher2k.$(SUFFIX) cher2k.mkl : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cher2k.veclib : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zher2k #################################################### zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zher2k.acml : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -648,9 +919,12 @@ zher2k.atlas : zher2k.$(SUFFIX) zher2k.mkl : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zher2k.veclib : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgemv.acml : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -661,9 +935,12 @@ sgemv.atlas : sgemv.$(SUFFIX) sgemv.mkl : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgemv #################################################### dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgemv.acml : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -674,10 +951,13 @@ dgemv.atlas : dgemv.$(SUFFIX) dgemv.mkl : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + ##################################### Cgemv #################################################### cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemv.acml : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -688,10 +968,13 @@ cgemv.atlas : cgemv.$(SUFFIX) cgemv.mkl : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemv #################################################### zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemv.acml : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -702,9 +985,12 @@ zgemv.atlas : zgemv.$(SUFFIX) zgemv.mkl : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sger.acml : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -715,9 +1001,12 @@ sger.atlas : sger.$(SUFFIX) sger.mkl : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sger.veclib : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dger #################################################### dger.goto : dger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dger.acml : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -728,9 +1017,44 @@ dger.atlas : dger.$(SUFFIX) dger.mkl : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dger.veclib : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cger #################################################### +cger.goto : cger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cger.acml : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.atlas : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.mkl : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.veclib : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zger #################################################### +zger.goto : zger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zger.acml : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +zger.atlas : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.mkl : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.veclib : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymv #################################################### ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssymv.acml : ssymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -741,9 +1065,12 @@ ssymv.atlas : ssymv.$(SUFFIX) ssymv.mkl : ssymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ssymv.veclib : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsymv #################################################### dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsymv.acml : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -754,9 +1081,12 @@ dsymv.atlas : dsymv.$(SUFFIX) dsymv.mkl : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dsymv.veclib : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Csymv #################################################### csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csymv.acml : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -767,9 +1097,12 @@ csymv.atlas : csymv.$(SUFFIX) csymv.mkl : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +csymv.veclib : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dsymv #################################################### zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsymv.acml : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -780,9 +1113,12 @@ zsymv.atlas : zsymv.$(SUFFIX) zsymv.mkl : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zsymv.veclib : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -793,9 +1129,12 @@ sgeev.atlas : sgeev.$(SUFFIX) sgeev.mkl : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgeev.veclib : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -806,10 +1145,13 @@ dgeev.atlas : dgeev.$(SUFFIX) dgeev.mkl : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgeev.veclib : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -820,10 +1162,13 @@ cgeev.atlas : cgeev.$(SUFFIX) cgeev.mkl : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgeev.veclib : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -834,10 +1179,12 @@ zgeev.atlas : zgeev.$(SUFFIX) zgeev.mkl : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgeev.veclib : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -848,9 +1195,12 @@ sgetri.atlas : sgetri.$(SUFFIX) sgetri.mkl : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgetri.veclib : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -861,10 +1211,13 @@ dgetri.atlas : dgetri.$(SUFFIX) dgetri.mkl : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgetri.veclib : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -875,10 +1228,13 @@ cgetri.atlas : cgetri.$(SUFFIX) cgetri.mkl : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgetri.veclib : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -889,10 +1245,12 @@ zgetri.atlas : zgetri.$(SUFFIX) zgetri.mkl : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgetri.veclib : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Spotrf #################################################### spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm spotrf.acml : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -903,9 +1261,12 @@ spotrf.atlas : spotrf.$(SUFFIX) spotrf.mkl : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +spotrf.veclib : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dpotrf #################################################### dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dpotrf.acml : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -916,10 +1277,13 @@ dpotrf.atlas : dpotrf.$(SUFFIX) dpotrf.mkl : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dpotrf.veclib : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cpotrf #################################################### cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cpotrf.acml : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -930,10 +1294,13 @@ cpotrf.atlas : cpotrf.$(SUFFIX) cpotrf.mkl : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cpotrf.veclib : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zpotrf #################################################### zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zpotrf.acml : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -944,10 +1311,13 @@ zpotrf.atlas : zpotrf.$(SUFFIX) zpotrf.mkl : 
zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zpotrf.veclib : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Chemv #################################################### chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm chemv.acml : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -958,10 +1328,13 @@ chemv.atlas : chemv.$(SUFFIX) chemv.mkl : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +chemv.veclib : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zhemv #################################################### zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zhemv.acml : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -972,9 +1345,12 @@ zhemv.atlas : zhemv.$(SUFFIX) zhemv.mkl : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zhemv.veclib : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sdot.acml : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -985,9 +1361,12 @@ sdot.atlas : sdot.$(SUFFIX) sdot.mkl : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sdot.veclib : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ddot #################################################### ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ddot.acml : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -998,9 +1377,44 @@ ddot.atlas : ddot.$(SUFFIX) ddot.mkl : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ddot.veclib : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cdot.acml : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.atlas : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.mkl : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.veclib : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zdot #################################################### 
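(Note on the Cdot targets just above and the Zdot targets that follow: for these complex dot products the .mkl and .veclib builds link the cdot-intel/zdot-intel objects, while the .goto builds use cdot/zdot objects compiled from zdot.c, which is built with RETURN_BY_STACK defined near the end of this patch. The two source variants evidently assume different calling conventions for the complex result; the vendor prototypes themselves are not part of this patch, so the sketch below only illustrates the two conventions generically, with made-up function names.)

```c
/* Generic illustration only, not OpenBLAS code: the same unconjugated
 * complex dot product exposed under the two result-passing conventions
 * that the separate zdot.c / zdot-intel.c objects have to cover. */
#include <complex.h>
#include <stdio.h>

/* Convention A: the complex result is the function's return value. */
static double complex zdotu_by_value(int n, const double complex *x,
                                     const double complex *y) {
    double complex s = 0.0;
    for (int i = 0; i < n; i++) s += x[i] * y[i];
    return s;
}

/* Convention B ("return by stack"): the result is written through an
 * extra leading pointer argument and the function returns void. */
static void zdotu_by_pointer(double complex *result, int n,
                             const double complex *x, const double complex *y) {
    *result = zdotu_by_value(n, x, y);
}

int main(void) {
    double complex x[2] = { 1.0 + 2.0 * I, 3.0 };
    double complex y[2] = { 2.0, 1.0 - 1.0 * I };
    double complex r1 = zdotu_by_value(2, x, y);
    double complex r2;
    zdotu_by_pointer(&r2, 2, x, y);
    printf("by value  : %g%+gi\n", creal(r1), cimag(r1));
    printf("by pointer: %g%+gi\n", creal(r2), cimag(r2));
    return 0;
}
```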
+zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdot.acml : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.atlas : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.mkl : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.veclib : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm saxpy.acml : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1011,9 +1425,12 @@ saxpy.atlas : saxpy.$(SUFFIX) saxpy.mkl : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +saxpy.veclib : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Daxpy #################################################### daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm daxpy.acml : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1024,10 +1441,13 @@ daxpy.atlas : daxpy.$(SUFFIX) daxpy.mkl : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +daxpy.veclib : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Caxpy #################################################### caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm caxpy.acml : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1038,10 +1458,13 @@ caxpy.atlas : caxpy.$(SUFFIX) caxpy.mkl : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +caxpy.veclib : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zaxpy #################################################### zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zaxpy.acml : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1052,23 +1475,363 @@ zaxpy.atlas : zaxpy.$(SUFFIX) zaxpy.mkl : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zaxpy.veclib : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scopy.acml : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.atlas : scopy.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.mkl : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.veclib : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcopy.acml : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.atlas : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.mkl : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.veclib : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccopy.acml : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.atlas : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.mkl : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.veclib : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcopy.acml : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.atlas : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.mkl : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.veclib : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sscal.acml : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.atlas : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.mkl : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.veclib : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dscal.acml : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.atlas : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.mkl : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.veclib : 
dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cscal.acml : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.atlas : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.mkl : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.veclib : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zscal.acml : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.atlas : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.mkl : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.veclib : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sasum.acml : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.atlas : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.mkl : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.veclib : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dasum.acml : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.atlas : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.mkl : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.veclib : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +casum.acml : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.atlas : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.mkl : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.veclib : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zasum #################################################### + 
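(Note on the copy/scal/asum/swap target families around this point: each links one of the new single-routine drivers added later in this patch. Those drivers all share the same skeleton: a size range from the command line, OPENBLAS_LOOPS and OPENBLAS_INCX from the environment, a gettimeofday()-timed call per size, and an MFlops line on stderr. The condensed sketch below is illustrative only: it times a stand-in summation loop instead of a real BLAS call, and each real driver plugs in its own routine and flop count.)

```c
/* Condensed sketch of the shared driver skeleton (not OpenBLAS code). */
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

static double seconds_between(const struct timeval *a, const struct timeval *b) {
    return (double)(b->tv_sec - a->tv_sec) +
           (double)(b->tv_usec - a->tv_usec) * 1.e-6;
}

int main(int argc, char *argv[]) {
    int from = 1, to = 200, step = 1, loops = 1;
    char *p;

    /* same argument/environment handling pattern as the real drivers */
    if (argc > 1) from = atoi(argv[1]);
    if (argc > 2) to   = atoi(argv[2]);
    if (argc > 3) step = atoi(argv[3]);
    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);

    double *x = malloc(sizeof(double) * (size_t)to);
    if (x == NULL) { fprintf(stderr, "Out of Memory!!\n"); return 1; }

    fprintf(stderr, "From : %3d To : %3d Step = %3d Loops = %d\n", from, to, step, loops);
    fprintf(stderr, "   SIZE          Flops\n");

    for (int m = from; m <= to; m += step) {
        double timeg = 0.0, result = 0.0;

        for (int l = 0; l < loops; l++) {
            for (int i = 0; i < m; i++)
                x[i] = (double)rand() / RAND_MAX - 0.5;

            struct timeval start, stop;
            gettimeofday(&start, NULL);
            /* stand-in for the timed BLAS call (here: a dasum-like sum) */
            for (int i = 0; i < m; i++)
                result += x[i] < 0.0 ? -x[i] : x[i];
            gettimeofday(&stop, NULL);
            timeg += seconds_between(&start, &stop);
        }

        timeg /= loops;
        double mflops = timeg > 0.0 ? (double)m / timeg * 1.e-6 : 0.0;
        fprintf(stderr, " %6d : %10.2f MFlops  (checksum %g)\n", m, mflops, result);
    }

    free(x);
    return 0;
}
```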
+zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zasum.acml : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.atlas : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.mkl : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.veclib : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sswap.acml : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.atlas : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.mkl : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.veclib : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dswap.acml : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.atlas : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.mkl : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.veclib : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cswap.acml : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.atlas : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.mkl : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.veclib : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zswap.acml : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.atlas : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.mkl : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.veclib : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Sgesv #################################################### +sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgesv.acml : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +sgesv.atlas : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.mkl : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.veclib : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgesv #################################################### +dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgesv.acml : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.atlas : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.mkl : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.veclib : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgesv #################################################### + +cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgesv.acml : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.atlas : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.mkl : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.veclib : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgesv #################################################### + +zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgesv.acml : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.atlas : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.mkl : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.veclib : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgemm3m #################################################### cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemm3m.mkl : cgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemm3m.veclib : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemm3m #################################################### zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemm3m.mkl : zgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemm3m.veclib : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### @@ -1204,6 +1967,13 @@ sger.$(SUFFIX) : ger.c dger.$(SUFFIX) : ger.c 
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +cger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + ssymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -1264,6 +2034,20 @@ sdot.$(SUFFIX) : dot.c ddot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +cdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + saxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -1276,6 +2060,72 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +sgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + + cgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ @@ -1284,7 +2134,7 @@ zgemm3m.$(SUFFIX) : gemm3m.c clean :: - @rm -f *.goto *.mkl *.acml *.atlas + @rm -f *.goto *.mkl *.acml *.atlas *.veclib include $(TOPDIR)/Makefile.tail diff --git a/benchmark/asum.c b/benchmark/asum.c new file mode 100644 index 000000000..beb6402f4 --- /dev/null +++ b/benchmark/asum.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef ASUM + +#ifdef COMPLEX +#ifdef DOUBLE +#define ASUM BLASFUNC(dzasum) +#else +#define ASUM BLASFUNC(scasum) +#endif +#else +#ifdef DOUBLE +#define ASUM BLASFUNC(dasum) +#else +#define ASUM BLASFUNC(sasum) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * 
abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef COPY + +#ifdef COMPLEX +#ifdef DOUBLE +#define COPY BLASFUNC(zcopy) +#else +#define COPY BLASFUNC(ccopy) +#endif +#else +#ifdef DOUBLE +#define COPY BLASFUNC(dcopy) +#else +#define COPY BLASFUNC(scopy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +double fabs(double); + +#undef GESV +#undef GETRS + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GESV BLASFUNC(qgesv) +#elif defined(DOUBLE) +#define GESV BLASFUNC(dgesv) +#else +#define GESV BLASFUNC(sgesv) +#endif +#else +#ifdef XDOUBLE +#define GESV BLASFUNC(xgesv) +#elif defined(DOUBLE) +#define GESV BLASFUNC(zgesv) +#else +#define GESV BLASFUNC(cgesv) +#endif +#endif + +#if 
defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *b; + blasint *ipiv; + + blasint m, i, j, info; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time\n"); + + for(m = from; m <= to; m += step){ + + fprintf(stderr, " %dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + b[i + j * m * COMPSIZE] = 0.0; + } + } + + + for (j = 0; j < m; ++j) { + for (i = 0; i < m * COMPSIZE; ++i) { + b[i] += a[i + j * m * COMPSIZE]; + } + } + + gettimeofday( &start, (struct timezone *)0); + + GESV (&m, &m, a, &m, ipiv, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + + + fprintf(stderr, + "%10.2f MFlops %10.6f s\n", + COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. 
* (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1); + + + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/plot-filter.sh b/benchmark/plot-filter.sh index 420ec9b02..73fbe97b6 100755 --- a/benchmark/plot-filter.sh +++ b/benchmark/plot-filter.sh @@ -52,6 +52,11 @@ C) awk '/MFlops/ { print $3,int($9) }'|tail --lines=+2 ;; +B) + # Copy Benchmark + awk '/MBytes/ { print $1,int($3) }'|tail --lines=+2 + ;; + *) awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2 diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 3caf61caa..1d714549b 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -88,6 +88,10 @@ double fabs(double); #if defined(__WIN32__) || defined(__WIN64__) +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; diff --git a/benchmark/scal.c b/benchmark/scal.c new file mode 100644 index 000000000..4c2da4d30 --- /dev/null +++ b/benchmark/scal.c @@ -0,0 +1,202 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SCAL + +#ifdef COMPLEX +#ifdef DOUBLE +#define SCAL BLASFUNC(zscal) +#else +#define SCAL BLASFUNC(cscal) +#endif +#else +#ifdef DOUBLE +#define SCAL BLASFUNC(dscal) +#else +#define SCAL BLASFUNC(sscal) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0 ) { + + for ( z in 1:length(argv) ) { + + if ( z == 1 ) { + nfrom <- as.numeric(argv[z]) + } else if ( z==2 ) { + nto <- as.numeric(argv[z]) + } else if ( z==3 ) { + nstep <- as.numeric(argv[z]) + } else if ( z==4 ) { + loops <- as.numeric(argv[z]) + } + } + +} + +p=Sys.getenv("OPENBLAS_LOOPS") +if ( p != "" ) { + loops <- as.numeric(p) +} + + +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) +cat(sprintf(" SIZE Flops Time\n")) + +n = nfrom +while ( n <= nto ) { + + A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + + l = 1 + + start <- proc.time()[3] + + while ( l <= loops ) { + + ev <- eigen(A) + l = l + 1 + } + + end <- proc.time()[3] + timeg = end 
- start + mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 ) + + st = sprintf("%.0fx%.0f :",n , n) + cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) + + n = n + nstep + +} + + diff --git a/benchmark/scripts/R/dgemm.R b/benchmark/scripts/R/dgemm.R new file mode 100755 index 000000000..f1c09c38d --- /dev/null +++ b/benchmark/scripts/R/dgemm.R @@ -0,0 +1,63 @@ +#!/usr/bin/Rscript + +argv <- commandArgs(trailingOnly = TRUE) + +nfrom = 128 +nto = 2048 +nstep = 128 +loops = 1 + +if ( length(argv) > 0 ) { + + for ( z in 1:length(argv) ) { + + if ( z == 1 ) { + nfrom <- as.numeric(argv[z]) + } else if ( z==2 ) { + nto <- as.numeric(argv[z]) + } else if ( z==3 ) { + nstep <- as.numeric(argv[z]) + } else if ( z==4 ) { + loops <- as.numeric(argv[z]) + } + } + +} + +p=Sys.getenv("OPENBLAS_LOOPS") +if ( p != "" ) { + loops <- as.numeric(p) +} + + +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) +cat(sprintf(" SIZE Flops Time\n")) + +n = nfrom +while ( n <= nto ) { + + A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + + l = 1 + + start <- proc.time()[3] + + while ( l <= loops ) { + + C <- A %*% B + l = l + 1 + } + + end <- proc.time()[3] + timeg = end - start + mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 ) + + st = sprintf("%.0fx%.0f :",n , n) + cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) + + n = n + nstep + +} + + diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R new file mode 100755 index 000000000..6c6b77f70 --- /dev/null +++ b/benchmark/scripts/R/dsolve.R @@ -0,0 +1,63 @@ +#!/usr/bin/Rscript + +argv <- commandArgs(trailingOnly = TRUE) + +nfrom = 128 +nto = 2048 +nstep = 128 +loops = 1 + +if ( length(argv) > 0 ) { + + for ( z in 1:length(argv) ) { + + if ( z == 1 ) { + nfrom <- as.numeric(argv[z]) + } else if ( z==2 ) { + nto <- as.numeric(argv[z]) + } else if ( z==3 ) { + nstep <- as.numeric(argv[z]) + } else if ( z==4 ) { + loops <- as.numeric(argv[z]) + } + } + +} + +p=Sys.getenv("OPENBLAS_LOOPS") +if ( p != "" ) { + loops <- as.numeric(p) +} + + +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) +cat(sprintf(" SIZE Flops Time\n")) + +n = nfrom +while ( n <= nto ) { + + A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) + + l = 1 + + start <- proc.time()[3] + + while ( l <= loops ) { + + solve(A,B) + l = l + 1 + } + + end <- proc.time()[3] + timeg = end - start + mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 ) + + st = sprintf("%.0fx%.0f :",n , n) + cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) + + n = n + nstep + +} + + diff --git a/benchmark/swap.c b/benchmark/swap.c new file mode 100644 index 000000000..9f108ef50 --- /dev/null +++ b/benchmark/swap.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above swapright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above swapright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. 
+3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE SWAPRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SWAP + +#ifdef COMPLEX +#ifdef DOUBLE +#define SWAP BLASFUNC(zswap) +#else +#define SWAP BLASFUNC(cswap) +#endif +#else +#ifdef DOUBLE +#define SWAP BLASFUNC(dswap) +#else +#define SWAP BLASFUNC(sswap) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT 
*)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} - fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag); + fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c Loops = %d\n", from, to, step,side,uplo,trans,diag,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); @@ -171,28 +181,35 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { - fprintf(stderr, " %6d : ", (int)m); + timeg=0.0; - for(j = 0; j < m; j++){ - for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } + fprintf(stderr, " %6d : ", (int)m); - gettimeofday( &start, (struct timezone *)0); + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#define RETURN_BY_STACK 1 +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT _Complex result; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT 
*)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT _Complex result; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l -#include "common.h" - -#ifdef __cplusplus -extern "C" { - /* Assume C declarations for C++ */ -#endif /* __cplusplus */ - -/*Set the number of threads on runtime.*/ -void openblas_set_num_threads(int num_threads); -void goto_set_num_threads(int num_threads); - -/*Get the number of threads on runtime.*/ -int openblas_get_num_threads(void); - -/*Get the number of physical processors (cores).*/ -int 
openblas_get_num_procs(void); - -/*Get the build configure on runtime.*/ -char* openblas_get_config(void); - -/* Get the parallelization type which is used by OpenBLAS */ -int openblas_get_parallel(void); -/* OpenBLAS is compiled for sequential use */ -#define OPENBLAS_SEQUENTIAL 0 -/* OpenBLAS is compiled using normal threading model */ -#define OPENBLAS_THREAD 1 -/* OpenBLAS is compiled using OpenMP threading model */ -#define OPENBLAS_OPENMP 2 - - -#define CBLAS_INDEX size_t - -typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; -typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; -typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; -typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; -typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; - -float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); -double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); -float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); -double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); - -openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); - -float cblas_sasum (blasint n, float *x, blasint incx); -double cblas_dasum (blasint n, double *x, blasint incx); -float cblas_scasum(blasint n, float *x, blasint incx); -double cblas_dzasum(blasint n, double *x, blasint incx); - -float cblas_snrm2 (blasint N, float *X, blasint incX); -double cblas_dnrm2 (blasint N, double *X, blasint incX); -float cblas_scnrm2(blasint N, float *X, blasint incX); -double cblas_dznrm2(blasint N, double *X, blasint incX); - -CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); -CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); - -void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); -void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); -void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); -void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); - -void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint 
incy); -void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); -void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); - -void cblas_srotg(float *a, float *b, float *c, float *s); -void cblas_drotg(double *a, double *b, double *c, double *s); - -void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); -void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); - -void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); - -void cblas_sscal(blasint N, float alpha, float *X, blasint incX); -void cblas_dscal(blasint N, double alpha, double *X, blasint incX); -void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); -void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); -void cblas_csscal(blasint N, float alpha, float *X, blasint incX); -void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); - -void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); -void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); -void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); -void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); - -void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); - -void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrsv(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); - -void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); - -void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); -void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); - -void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, - blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, - blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, - float *Y, blasint incY, float *A, blasint lda); -void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, - double *Y, blasint incY, double *A, blasint lda); - -void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); - - -void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbmv(enum CBLAS_ORDER order, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, - blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, - blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - - -void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, - float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, 
double alpha, double *Ap, - double *X, blasint incX, double beta, double *Y, blasint incY); - -void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); -void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); - -void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); -void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); - -void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); -void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); -void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); -void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); - -void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); -void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsymm(enum 
CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); -void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); -void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); - -void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, 
double *B, blasint ldb); -void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); - -void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); - -void cblas_xerbla(blasint p, char *rout, char *form, ...); - -/*** BLAS extensions ***/ - -void cblas_saxpby(blasint n, float alpha, float *x, blasint incx,float beta, float *y, blasint incy); - -void cblas_daxpby(blasint n, double alpha, double *x, blasint incx,double beta, double *y, blasint incy); - -void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, float *y, blasint incy); - -void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy); - -void cblas_somatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a, - blasint clda, float *b, blasint cldb); -void cblas_domatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, - blasint clda, double *b, blasint cldb); -void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, - blasint clda, void *b, blasint cldb); -void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, - blasint clda, void *b, blasint cldb); - -void cblas_simatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a, - blasint clda, blasint cldb); -void cblas_dimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, - blasint clda, blasint cldb); -void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float* calpha, float* a, - blasint clda, blasint cldb); -void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a, - blasint 
clda, blasint cldb); - -void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta, - float *c, blasint cldc); -void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta, - double *c, blasint cldc); -void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta, - float *c, blasint cldc); -void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta, - double *c, blasint cldc); - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif diff --git a/cmake/arch.cmake b/cmake/arch.cmake new file mode 100644 index 000000000..d6fa3ed5d --- /dev/null +++ b/cmake/arch.cmake @@ -0,0 +1,115 @@ +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets various variables based on architecture. + +if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") + + if (${ARCH} STREQUAL "x86") + if (NOT BINARY) + set(NO_BINARY_MODE 1) + endif () + endif () + + if (NOT NO_EXPRECISION) + if (${F_COMPILER} MATCHES "GFORTRAN") + # N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa + if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + set(EXPRECISION 1) + set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double") + set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") + endif () + if (${CMAKE_C_COMPILER} STREQUAL "Clang") + set(EXPRECISION 1) + set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION") + set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") + endif () + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "Intel") + set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") +endif () + +if (USE_OPENMP) + + if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "Clang") + message(WARNING "Clang doesn't support OpenMP yet.") + set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "Intel") + set(CCOMMON_OPT "${CCOMMON_OPT} -openmp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "PGI") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + set(CEXTRALIB "${CEXTRALIB} -lstdc++") + endif () + + if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") + set(CCOMMON_OPT "${CCOMMON_OPT} -mp") + endif () +endif () + + +if (DYNAMIC_ARCH) + if (${ARCH} STREQUAL "x86") + set(DYNAMIC_CORE "KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") + endif () + + if (${ARCH} STREQUAL "x86_64") + set(DYNAMIC_CORE "PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") + if (NOT NO_AVX) + set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER") + endif () + if (NOT NO_AVX2) + set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL") + endif () + endif () + + if (NOT DYNAMIC_CORE) + unset(DYNAMIC_ARCH) + endif () +endif () + +if (${ARCH} STREQUAL "ia64") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) + + if (${F_COMPILER} MATCHES "GFORTRAN") + if (${CMAKE_C_COMPILER} STREQUAL "GNU") + # EXPRECISION = 1 + # CCOMMON_OPT += -DEXPRECISION + endif () + endif () +endif () + +if (${ARCH} STREQUAL "mips64") + set(NO_BINARY_MODE 1) 
+endif () + +if (${ARCH} STREQUAL "alpha") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + +if (${ARCH} STREQUAL "arm") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + +if (${ARCH} STREQUAL "arm64") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake new file mode 100644 index 000000000..89ec31446 --- /dev/null +++ b/cmake/c_check.cmake @@ -0,0 +1,89 @@ +## +## Author: Hank Anderson +## Description: Ported from the OpenBLAS/c_check perl script. +## This is triggered by prebuild.cmake and runs before any of the code is built. +## Creates config.h and Makefile.conf. + +# CMake vars set by this file: +# OSNAME (use CMAKE_SYSTEM_NAME) +# ARCH +# C_COMPILER (use CMAKE_C_COMPILER) +# BINARY32 +# BINARY64 +# FU +# CROSS_SUFFIX +# CROSS +# CEXTRALIB + +# Defines set by this file: +# OS_ +# ARCH_ +# C_ +# __32BIT__ +# __64BIT__ +# FUNDERSCORE +# PTHREAD_CREATE_FUNC + +# N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables. +set(FU "") +if(APPLE) +set(FU "_") +elseif(MSVC) +set(FU "_") +elseif(UNIX) +set(FU "") +endif() + +# Convert CMake vars into the format that OpenBLAS expects +string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS) +if (${HOST_OS} STREQUAL "WINDOWS") + set(HOST_OS WINNT) +endif () + +# added by hpa - check size of void ptr to detect 64-bit compile +if (NOT DEFINED BINARY) + set(BINARY 32) + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + set(BINARY 64) + endif () +endif () + +if (BINARY EQUAL 64) + set(BINARY64 1) +else () + set(BINARY32 1) +endif () + +# CMake docs define these: +# CMAKE_SYSTEM_PROCESSOR - The name of the CPU CMake is building for. +# CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on. +# +# TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check +set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) +if (${ARCH} STREQUAL "AMD64") + set(ARCH "x86_64") +endif () + +# If you are using a 32-bit compiler on a 64-bit system CMAKE_SYSTEM_PROCESSOR will be wrong +if (${ARCH} STREQUAL "x86_64" AND BINARY EQUAL 32) + set(ARCH x86) +endif () + +if (${ARCH} STREQUAL "X86") + set(ARCH x86) +endif () + +set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +if (${COMPILER_ID} STREQUAL "GNU") + set(COMPILER_ID "GCC") +endif () + +string(TOUPPER ${ARCH} UC_ARCH) + +file(WRITE ${TARGET_CONF} + "#define OS_${HOST_OS}\t1\n" + "#define ARCH_${UC_ARCH}\t1\n" + "#define C_${COMPILER_ID}\t1\n" + "#define __${BINARY}BIT__\t1\n" + "#define FUNDERSCORE\t${FU}\n") + diff --git a/cmake/cc.cmake b/cmake/cc.cmake new file mode 100644 index 000000000..de196524f --- /dev/null +++ b/cmake/cc.cmake @@ -0,0 +1,103 @@ +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets C related variables. 
+ +if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") + + set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") + set(COMMON_PROF "${COMMON_PROF} -fno-inline") + set(NO_UNINITIALIZED_WARN "-Wno-uninitialized") + + if (QUIET_MAKE) + set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") + endif () + + if (NO_BINARY_MODE) + + if (${ARCH} STREQUAL "mips64") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=n32") + endif () + set(BINARY_DEFINED 1) + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") + set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") + set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") + endif () + + if (${OSNAME} STREQUAL "AIX") + set(BINARY_DEFINED 1) + endif () + endif () + + if (NOT BINARY_DEFINED) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "PGI") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7") + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") + + if (${ARCH} STREQUAL "mips64") + + if (NOT BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -n32") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -n64") + endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") + endif () + + else () + + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -m64") + endif () + endif () +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "SUN") + set(CCOMMON_OPT "${CCOMMON_OPT} -w") + if (${ARCH} STREQUAL "x86") + set(CCOMMON_OPT "${CCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () +endif () + diff --git a/cmake/export.cmake b/cmake/export.cmake new file mode 100644 index 000000000..adf59101f --- /dev/null +++ b/cmake/export.cmake @@ -0,0 +1,60 @@ + +#Only generate .def for dll on MSVC +if(MSVC) + +set_source_files_properties(${OpenBLAS_DEF_FILE} PROPERTIES GENERATED 1) + +if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") +else() + set(ARCH_IN ${ARCH}) +endif() + +if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") +endif () + +if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) +else() + set(EXPRECISION_IN ${EXPRECISION}) +endif() + +if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) +else() + set(NO_CBLAS_IN ${NO_CBLAS}) +endif() + +if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) +else() + set(NO_LAPACK_IN ${NO_LAPACK}) +endif() + +if (NOT DEFINED NO_LAPACKE) + set(NO_LAPACKE_IN 0) +else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) +endif() + +if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) +else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) +endif() + +if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) +else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) +endif() + +add_custom_command( + TARGET ${OpenBLAS_LIBNAME} PRE_LINK + COMMAND perl + ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" 
"${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" + COMMENT "Create openblas.def file" + VERBATIM) + +endif() \ No newline at end of file diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake new file mode 100644 index 000000000..e8fe4bfa7 --- /dev/null +++ b/cmake/f_check.cmake @@ -0,0 +1,66 @@ +## +## Author: Hank Anderson +## Copyright: (c) Stat-Ease, Inc. +## Created: 12/29/14 +## Last Modified: 12/29/14 +## Description: Ported from the OpenBLAS/f_check perl script. +## This is triggered by prebuild.cmake and runs before any of the code is built. +## Appends Fortran information to config.h and Makefile.conf. + +# CMake vars set by this file: +# F_COMPILER +# FC +# BU +# NOFORTRAN +# NEED2UNDERSCORES +# FEXTRALIB + +# Defines set by this file: +# BUNDERSCORE +# NEEDBUNDERSCORE +# NEED2UNDERSCORES + +if (MSVC) + # had to do this for MSVC, else CMake automatically assumes I have ifort... -hpa + include(CMakeForceCompiler) + CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) +endif () + +if (NOT NO_LAPACK) + enable_language(Fortran) +else() + include(CMakeForceCompiler) + CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) +endif() + +if (NOT ONLY_CBLAS) + # N.B. f_check is not cross-platform, so instead try to use CMake variables + # run f_check (appends to TARGET files) +# message(STATUS "Running f_check...") +# execute_process(COMMAND perl f_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_Fortran_COMPILER} +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + + # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile + # TODO: set FEXTRALIB flags a la f_check? + + set(BU "_") + file(APPEND ${TARGET_CONF} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n" + "#define NEED2UNDERSCORES 0\n") + +else () + + #When we only build CBLAS, we set NOFORTRAN=2 + set(NOFORTRAN 2) + set(NO_FBLAS 1) + #set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler + set(BU "_") + file(APPEND ${TARGET_CONF} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") +endif() + +get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) +string(TOUPPER ${F_COMPILER} F_COMPILER) + diff --git a/cmake/fc.cmake b/cmake/fc.cmake new file mode 100644 index 000000000..ba156c210 --- /dev/null +++ b/cmake/fc.cmake @@ -0,0 +1,200 @@ +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Sets Fortran related variables. 
+ +if (${F_COMPILER} STREQUAL "G77") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + if (NOT NO_BINARY_MODE) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${F_COMPILER} STREQUAL "G95") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + if (NOT NO_BINARY_MODE) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () +endif () + +if (${F_COMPILER} STREQUAL "GFORTRAN") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc + if (NOT NO_LAPACK) + set(EXTRALIB "{EXTRALIB} -lgfortran") + endif () + if (NO_BINARY_MODE) + if (${ARCH} STREQUAL "mips64") + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") + endif () + endif () + else () + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + endif () + endif () + + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "INTEL") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "FUJITSU") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "IBM") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") + # FCOMMON_OPT += -qarch=440 + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -q64") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -qintsize=8") + endif () + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -q32") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "PGI") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") + set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7-64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "PATHSCALE") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + endif () + + if (NOT ${ARCH} STREQUAL "mips64") + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + else () + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") + endif () + endif () + + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "OPEN64") + + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64") + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + endif () + + if (${ARCH} STREQUAL "mips64") + + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -n32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -n64") 
+ endif () + + if (${CORE} STREQUAL "LOONGSON3A") + set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") + endif () + + if (${CORE} STREQUAL "LOONGSON3B") + set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") + endif () + else () + if (NOT BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + endif () + + if (USE_OPENMP) + set(FEXTRALIB "${FEXTRALIB} -lstdc++") + set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + endif () +endif () + +if (${F_COMPILER} STREQUAL "SUN") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN") + if (${ARCH} STREQUAL "x86") + set(FCOMMON_OPT "${FCOMMON_OPT} -m32") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") + endif () + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") + endif () +endif () + +if (${F_COMPILER} STREQUAL "COMPAQ") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + +# from the root Makefile - this is for lapack-netlib to compile the correct secnd file. +if (${F_COMPILER} STREQUAL "GFORTRAN") + set(TIMER "INT_ETIME") +else () + set(TIMER "NONE") +endif () + diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake new file mode 100644 index 000000000..fad84de51 --- /dev/null +++ b/cmake/kernel.cmake @@ -0,0 +1,165 @@ +# helper functions for the kernel CMakeLists.txt + + +# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. +macro(SetDefaultL1) + set(SAMAXKERNEL amax.S) + set(DAMAXKERNEL amax.S) + set(QAMAXKERNEL amax.S) + set(CAMAXKERNEL zamax.S) + set(ZAMAXKERNEL zamax.S) + set(XAMAXKERNEL zamax.S) + set(SAMINKERNEL amin.S) + set(DAMINKERNEL amin.S) + set(QAMINKERNEL amin.S) + set(CAMINKERNEL zamin.S) + set(ZAMINKERNEL zamin.S) + set(XAMINKERNEL zamin.S) + set(SMAXKERNEL max.S) + set(DMAXKERNEL max.S) + set(QMAXKERNEL max.S) + set(SMINKERNEL min.S) + set(DMINKERNEL min.S) + set(QMINKERNEL min.S) + set(ISAMAXKERNEL iamax.S) + set(IDAMAXKERNEL iamax.S) + set(IQAMAXKERNEL iamax.S) + set(ICAMAXKERNEL izamax.S) + set(IZAMAXKERNEL izamax.S) + set(IXAMAXKERNEL izamax.S) + set(ISAMINKERNEL iamin.S) + set(IDAMINKERNEL iamin.S) + set(IQAMINKERNEL iamin.S) + set(ICAMINKERNEL izamin.S) + set(IZAMINKERNEL izamin.S) + set(IXAMINKERNEL izamin.S) + set(ISMAXKERNEL iamax.S) + set(IDMAXKERNEL iamax.S) + set(IQMAXKERNEL iamax.S) + set(ISMINKERNEL iamin.S) + set(IDMINKERNEL iamin.S) + set(IQMINKERNEL iamin.S) + set(SASUMKERNEL asum.S) + set(DASUMKERNEL asum.S) + set(CASUMKERNEL zasum.S) + set(ZASUMKERNEL zasum.S) + set(QASUMKERNEL asum.S) + set(XASUMKERNEL zasum.S) + set(SAXPYKERNEL axpy.S) + set(DAXPYKERNEL axpy.S) + set(CAXPYKERNEL zaxpy.S) + set(ZAXPYKERNEL zaxpy.S) + set(QAXPYKERNEL axpy.S) + set(XAXPYKERNEL zaxpy.S) + set(SCOPYKERNEL copy.S) + set(DCOPYKERNEL copy.S) + set(CCOPYKERNEL zcopy.S) + set(ZCOPYKERNEL zcopy.S) + set(QCOPYKERNEL copy.S) + set(XCOPYKERNEL zcopy.S) + set(SDOTKERNEL dot.S) + set(DDOTKERNEL dot.S) + set(CDOTKERNEL zdot.S) + set(ZDOTKERNEL zdot.S) + set(QDOTKERNEL dot.S) + set(XDOTKERNEL zdot.S) + set(SNRM2KERNEL nrm2.S) + set(DNRM2KERNEL nrm2.S) + set(QNRM2KERNEL nrm2.S) + set(CNRM2KERNEL znrm2.S) + set(ZNRM2KERNEL znrm2.S) + set(XNRM2KERNEL znrm2.S) + set(SROTKERNEL rot.S) + set(DROTKERNEL rot.S) + set(QROTKERNEL rot.S) + set(CROTKERNEL zrot.S) + set(ZROTKERNEL zrot.S) + set(XROTKERNEL zrot.S) + set(SSCALKERNEL scal.S) + set(DSCALKERNEL scal.S) + set(CSCALKERNEL zscal.S) + set(ZSCALKERNEL zscal.S) + set(QSCALKERNEL scal.S) + 
set(XSCALKERNEL zscal.S) + set(SSWAPKERNEL swap.S) + set(DSWAPKERNEL swap.S) + set(CSWAPKERNEL zswap.S) + set(ZSWAPKERNEL zswap.S) + set(QSWAPKERNEL swap.S) + set(XSWAPKERNEL zswap.S) + set(SGEMVNKERNEL gemv_n.S) + set(SGEMVTKERNEL gemv_t.S) + set(DGEMVNKERNEL gemv_n.S) + set(DGEMVTKERNEL gemv_t.S) + set(CGEMVNKERNEL zgemv_n.S) + set(CGEMVTKERNEL zgemv_t.S) + set(ZGEMVNKERNEL zgemv_n.S) + set(ZGEMVTKERNEL zgemv_t.S) + set(QGEMVNKERNEL gemv_n.S) + set(QGEMVTKERNEL gemv_t.S) + set(XGEMVNKERNEL zgemv_n.S) + set(XGEMVTKERNEL zgemv_t.S) + set(SCABS_KERNEL ../generic/cabs.c) + set(DCABS_KERNEL ../generic/cabs.c) + set(QCABS_KERNEL ../generic/cabs.c) + set(LSAME_KERNEL ../generic/lsame.c) + set(SAXPBYKERNEL ../arm/axpby.c) + set(DAXPBYKERNEL ../arm/axpby.c) + set(CAXPBYKERNEL ../arm/zaxpby.c) + set(ZAXPBYKERNEL ../arm/zaxpby.c) +endmacro () + +macro(SetDefaultL2) + set(SGEMVNKERNEL gemv_n.S) + set(SGEMVTKERNEL gemv_t.S) + set(DGEMVNKERNEL gemv_n.S) + set(DGEMVTKERNEL gemv_t.S) + set(CGEMVNKERNEL zgemv_n.S) + set(CGEMVTKERNEL zgemv_t.S) + set(ZGEMVNKERNEL zgemv_n.S) + set(ZGEMVTKERNEL zgemv_t.S) + set(QGEMVNKERNEL gemv_n.S) + set(QGEMVTKERNEL gemv_t.S) + set(XGEMVNKERNEL zgemv_n.S) + set(XGEMVTKERNEL zgemv_t.S) + set(SGERKERNEL ../generic/ger.c) + set(DGERKERNEL ../generic/ger.c) + set(QGERKERNEL ../generic/ger.c) + set(CGERUKERNEL ../generic/zger.c) + set(CGERCKERNEL ../generic/zger.c) + set(ZGERUKERNEL ../generic/zger.c) + set(ZGERCKERNEL ../generic/zger.c) + set(XGERUKERNEL ../generic/zger.c) + set(XGERCKERNEL ../generic/zger.c) + set(SSYMV_U_KERNEL ../generic/symv_k.c) + set(SSYMV_L_KERNEL ../generic/symv_k.c) + set(DSYMV_U_KERNEL ../generic/symv_k.c) + set(DSYMV_L_KERNEL ../generic/symv_k.c) + set(QSYMV_U_KERNEL ../generic/symv_k.c) + set(QSYMV_L_KERNEL ../generic/symv_k.c) + set(CSYMV_U_KERNEL ../generic/zsymv_k.c) + set(CSYMV_L_KERNEL ../generic/zsymv_k.c) + set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) + set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) + set(XSYMV_U_KERNEL ../generic/zsymv_k.c) + set(XSYMV_L_KERNEL ../generic/zsymv_k.c) + set(CHEMV_U_KERNEL ../generic/zhemv_k.c) + set(CHEMV_L_KERNEL ../generic/zhemv_k.c) + set(CHEMV_V_KERNEL ../generic/zhemv_k.c) + set(CHEMV_M_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) + set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) + set(XHEMV_U_KERNEL ../generic/zhemv_k.c) + set(XHEMV_L_KERNEL ../generic/zhemv_k.c) + set(XHEMV_V_KERNEL ../generic/zhemv_k.c) + set(XHEMV_M_KERNEL ../generic/zhemv_k.c) +endmacro () + +macro(SetDefaultL3) + set(SGEADD_KERNEL ../generic/geadd.c) + set(DGEADD_KERNEL ../generic/geadd.c) + set(CGEADD_KERNEL ../generic/zgeadd.c) + set(ZGEADD_KERNEL ../generic/zgeadd.c) +endmacro () \ No newline at end of file diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake new file mode 100644 index 000000000..3e81611ab --- /dev/null +++ b/cmake/lapack.cmake @@ -0,0 +1,347 @@ +# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. 
+ +set(ALLAUX + ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f + ../INSTALL/ilaver.f ../INSTALL/slamch.f +) + +set(SCLAUX + sbdsdc.f + sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f + slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f + slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f + slagts.f slamrg.f slanst.f + slapy2.f slapy3.f slarnv.f + slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f + slarrk.f slarrr.f slaneg.f + slartg.f slaruv.f slas2.f slascl.f + slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f + slasd7.f slasd8.f slasda.f slasdq.f slasdt.f + slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f + slasr.f slasrt.f slassq.f slasv2.f spttrf.f sstebz.f sstedc.f + ssteqr.f ssterf.f slaisnan.f sisnan.f + slartgp.f slartgs.f + ../INSTALL/second_${TIMER}.f +) + +set(DZLAUX + dbdsdc.f + dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f + dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f + dlagts.f dlamrg.f dlanst.f + dlapy2.f dlapy3.f dlarnv.f + dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f + dlarrk.f dlarrr.f dlaneg.f + dlartg.f dlaruv.f dlas2.f dlascl.f + dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f + dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f + dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f + dsteqr.f dsterf.f dlaisnan.f disnan.f + dlartgp.f dlartgs.f + ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f +) + +set(SLASRC + sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f + sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f + sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f + sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f + sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f + sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f + sgetc2.f sgetri.f + sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f + sggglm.f sgghrd.f sgglse.f sggqrf.f + sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f + sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f + shsein.f shseqr.f slabrd.f slacon.f slacn2.f + slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f + slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f + slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f + slansy.f slantb.f slantp.f slantr.f slanv2.f + slapll.f slapmt.f + slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f + slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f + slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f + slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f + slarrv.f slartv.f + slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f + slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f + sopgtr.f sopmtr.f sorg2l.f sorg2r.f + sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f + sorgrq.f sorgtr.f sorm2l.f sorm2r.f + sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f + sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f + spbstf.f spbsv.f spbsvx.f + spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f + sposvx.f spstrf.f spstf2.f + sppcon.f sppequ.f + spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f + spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f + ssbev.f ssbevd.f ssbevx.f 
ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f + ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f + sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f + sstevx.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f + ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f + ssyswapr.f ssytrs.f ssytrs2.f ssyconv.f + ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f + ssytri_rook.f ssycon_rook.f ssysv_rook.f + stbcon.f + stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f + stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f + stptrs.f + strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f + strtrs.f stzrqf.f stzrzf.f sstemr.f + slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f + stfttr.f stpttf.f stpttr.f strttf.f strttp.f + sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f + sgeequb.f ssyequb.f spoequb.f sgbequb.f + sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f + sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f + sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f + stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f +) + +set(DSLASRC spotrs.f) + +set(CLASRC + cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f + cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f + cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f + cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f + cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f + cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f + cgesvx.f cgetc2.f cgetri.f + cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f + cgghrd.f cgglse.f cggqrf.f cggrqf.f + cggsvd.f cggsvp.f + cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f + chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f + checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f + chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f + chetf2.f chetrd.f + chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f + chetrs.f chetrs2.f + chetf2_rook.f chetrf_rook.f chetri_rook.f chetrs_rook.f checon_rook.f chesv_rook.f + chgeqz.f chpcon.f chpev.f chpevd.f + chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f + chpsvx.f + chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f + clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f + claed0.f claed7.f claed8.f + claein.f claesy.f claev2.f clags2.f clagtm.f + clahef.f clahef_rook.f clahqr.f + clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f + clanhb.f clanhe.f + clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f + clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f + claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f + claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f + claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f + clarf.f clarfb.f clarfg.f clarft.f clarfgp.f + clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f + clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f + clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f + clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f + cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f + cposv.f cposvx.f cpstrf.f cpstf2.f + cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f + cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f 
cptts2.f + crot.f cspcon.f csprfs.f cspsv.f + cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f + cstegr.f cstein.f csteqr.f + csycon.f + csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f csytri2.f csytri2x.f + csyswapr.f csytrs.f csytrs2.f csyconv.f + csytf2_rook.f csytrf_rook.f csytrs_rook.f + csytri_rook.f csycon_rook.f csysv_rook.f + ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f + ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f + ctprfs.f ctptri.f + ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f + ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f + cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f + cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f + cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f + cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f + chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f + ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f + cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f + cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f + cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f + cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f + ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f +) + +set(ZCLASRC cpotrs.f) + +set(DLASRC + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f + dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f + dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f + dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f + dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f + dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f + dgetc2.f dgetri.f + dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f + dggglm.f dgghrd.f dgglse.f dggqrf.f + dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f + dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f + dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f + dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f + dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f + dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f + dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f + dlapll.f dlapmt.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f + dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f + dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f + dlargv.f dlarrv.f dlartv.f + dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f + dopgtr.f dopmtr.f dorg2l.f dorg2r.f + dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f + dorgrq.f dorgtr.f dorm2l.f dorm2r.f + dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f + dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f + dpbstf.f dpbsv.f dpbsvx.f + dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f + dposvx.f dpotrs.f dpstrf.f dpstf2.f + dppcon.f dppequ.f + dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f + dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f + dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f + dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f + dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f + dstevx.f + dsycon.f dsyev.f dsyevd.f dsyevr.f + dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f + dsysv.f dsysvx.f + dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f 
dsytri2.f dsytri2x.f + dsyswapr.f dsytrs.f dsytrs2.f dsyconv.f + dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f + dsytri_rook.f dsycon_rook.f dsysv_rook.f + dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f + dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f + dtptrs.f + dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f + dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f + dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f + dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f + dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f + dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f + dgeequb.f dsyequb.f dpoequb.f dgbequb.f + dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f + dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f + dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f + dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f +) + +set(ZLASRC + zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f + zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f + zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f + zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f + zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f + zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f + zgetri.f + zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f + zgghrd.f zgglse.f zggqrf.f zggrqf.f + zggsvd.f zggsvp.f + zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f + zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f + zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f + zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f + zhetf2.f zhetrd.f + zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f + zhetrs.f zhetrs2.f + zhetf2_rook.f zhetrf_rook.f zhetri_rook.f zhetrs_rook.f zhecon_rook.f zhesv_rook.f + zhgeqz.f zhpcon.f zhpev.f zhpevd.f + zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f + zhpsvx.f + zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f + zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f + zlaed0.f zlaed7.f zlaed8.f + zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f + zlahef.f zlahef_rook.f zlahqr.f + zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f + zlangt.f zlanhb.f + zlanhe.f + zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f + zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f + zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f + zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f + zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f + zlarcm.f zlarf.f zlarfb.f + zlarfg.f zlarft.f zlarfgp.f + zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f + zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f + zlassq.f zlasyf.f zlasyf_rook.f + zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f + zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f + zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f + zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f + zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f + zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f + zrot.f zspcon.f zsprfs.f zspsv.f + zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f + zstegr.f zstein.f zsteqr.f + zsycon.f + zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f + zsyswapr.f zsytrs.f zsytrs2.f zsyconv.f + zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f + zsytri_rook.f zsycon_rook.f zsysv_rook.f + ztbcon.f ztbrfs.f ztbtrs.f 
ztgevc.f ztgex2.f + ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f + ztprfs.f ztptri.f + ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f + ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f + zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f + zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f + zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f + zunmtr.f zupgtr.f + zupmtr.f izmax1.f dzsum1.f zstemr.f + zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f + zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f + ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f + zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f + zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f + zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f + zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f + ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f +) + +set(LA_REL_SRC ${ALLAUX}) +if (BUILD_SINGLE) + list(APPEND LA_REL_SRC ${SLASRC} ${DSLASRC} ${SCLAUX}) +endif () + +if (BUILD_DOUBLE) + list(APPEND LA_REL_SRC ${DLASRC} ${DSLASRC} ${DZLAUX}) +endif () + +if (BUILD_COMPLEX) + list(APPEND LA_REL_SRC ${CLASRC} ${ZCLASRC} ${SCLAUX}) +endif () + +if (BUILD_COMPLEX16) + list(APPEND LA_REL_SRC ${ZLASRC} ${ZCLASRC} ${DZLAUX}) +endif () + +# add lapack-netlib folder to the sources +set(LA_SOURCES "") +foreach (LA_FILE ${LA_REL_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") +endforeach () +set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake new file mode 100644 index 000000000..39ade0577 --- /dev/null +++ b/cmake/lapacke.cmake @@ -0,0 +1,2067 @@ + +set(C_SRC + lapacke_cbbcsd.c + lapacke_cbbcsd_work.c + lapacke_cbdsqr.c + lapacke_cbdsqr_work.c + lapacke_cgbbrd.c + lapacke_cgbbrd_work.c + lapacke_cgbcon.c + lapacke_cgbcon_work.c + lapacke_cgbequ.c + lapacke_cgbequ_work.c + lapacke_cgbequb.c + lapacke_cgbequb_work.c + lapacke_cgbrfs.c + lapacke_cgbrfs_work.c + lapacke_cgbsv.c + lapacke_cgbsv_work.c + lapacke_cgbsvx.c + lapacke_cgbsvx_work.c + lapacke_cgbtrf.c + lapacke_cgbtrf_work.c + lapacke_cgbtrs.c + lapacke_cgbtrs_work.c + lapacke_cgebak.c + lapacke_cgebak_work.c + lapacke_cgebal.c + lapacke_cgebal_work.c + lapacke_cgebrd.c + lapacke_cgebrd_work.c + lapacke_cgecon.c + lapacke_cgecon_work.c + lapacke_cgeequ.c + lapacke_cgeequ_work.c + lapacke_cgeequb.c + lapacke_cgeequb_work.c + lapacke_cgees.c + lapacke_cgees_work.c + lapacke_cgeesx.c + lapacke_cgeesx_work.c + lapacke_cgeev.c + lapacke_cgeev_work.c + lapacke_cgeevx.c + lapacke_cgeevx_work.c + lapacke_cgehrd.c + lapacke_cgehrd_work.c + lapacke_cgelq2.c + lapacke_cgelq2_work.c + lapacke_cgelqf.c + lapacke_cgelqf_work.c + lapacke_cgels.c + lapacke_cgels_work.c + lapacke_cgelsd.c + lapacke_cgelsd_work.c + lapacke_cgelss.c + lapacke_cgelss_work.c + lapacke_cgelsy.c + lapacke_cgelsy_work.c + lapacke_cgemqrt.c + lapacke_cgemqrt_work.c + lapacke_cgeqlf.c + lapacke_cgeqlf_work.c + lapacke_cgeqp3.c + lapacke_cgeqp3_work.c + lapacke_cgeqpf.c + lapacke_cgeqpf_work.c + lapacke_cgeqr2.c + lapacke_cgeqr2_work.c + lapacke_cgeqrf.c + lapacke_cgeqrf_work.c + lapacke_cgeqrfp.c + lapacke_cgeqrfp_work.c + lapacke_cgeqrt.c + lapacke_cgeqrt2.c + lapacke_cgeqrt2_work.c + lapacke_cgeqrt3.c + lapacke_cgeqrt3_work.c + lapacke_cgeqrt_work.c + lapacke_cgerfs.c + lapacke_cgerfs_work.c + lapacke_cgerqf.c + lapacke_cgerqf_work.c + lapacke_cgesdd.c + lapacke_cgesdd_work.c + lapacke_cgesv.c + lapacke_cgesv_work.c + lapacke_cgesvd.c + 
lapacke_cgesvd_work.c + lapacke_cgesvx.c + lapacke_cgesvx_work.c + lapacke_cgetf2.c + lapacke_cgetf2_work.c + lapacke_cgetrf.c + lapacke_cgetrf_work.c + lapacke_cgetri.c + lapacke_cgetri_work.c + lapacke_cgetrs.c + lapacke_cgetrs_work.c + lapacke_cggbak.c + lapacke_cggbak_work.c + lapacke_cggbal.c + lapacke_cggbal_work.c + lapacke_cgges.c + lapacke_cgges_work.c + lapacke_cggesx.c + lapacke_cggesx_work.c + lapacke_cggev.c + lapacke_cggev_work.c + lapacke_cggevx.c + lapacke_cggevx_work.c + lapacke_cggglm.c + lapacke_cggglm_work.c + lapacke_cgghrd.c + lapacke_cgghrd_work.c + lapacke_cgglse.c + lapacke_cgglse_work.c + lapacke_cggqrf.c + lapacke_cggqrf_work.c + lapacke_cggrqf.c + lapacke_cggrqf_work.c + lapacke_cggsvd.c + lapacke_cggsvd_work.c + lapacke_cggsvp.c + lapacke_cggsvp_work.c + lapacke_cgtcon.c + lapacke_cgtcon_work.c + lapacke_cgtrfs.c + lapacke_cgtrfs_work.c + lapacke_cgtsv.c + lapacke_cgtsv_work.c + lapacke_cgtsvx.c + lapacke_cgtsvx_work.c + lapacke_cgttrf.c + lapacke_cgttrf_work.c + lapacke_cgttrs.c + lapacke_cgttrs_work.c + lapacke_chbev.c + lapacke_chbev_work.c + lapacke_chbevd.c + lapacke_chbevd_work.c + lapacke_chbevx.c + lapacke_chbevx_work.c + lapacke_chbgst.c + lapacke_chbgst_work.c + lapacke_chbgv.c + lapacke_chbgv_work.c + lapacke_chbgvd.c + lapacke_chbgvd_work.c + lapacke_chbgvx.c + lapacke_chbgvx_work.c + lapacke_chbtrd.c + lapacke_chbtrd_work.c + lapacke_checon.c + lapacke_checon_work.c + lapacke_cheequb.c + lapacke_cheequb_work.c + lapacke_cheev.c + lapacke_cheev_work.c + lapacke_cheevd.c + lapacke_cheevd_work.c + lapacke_cheevr.c + lapacke_cheevr_work.c + lapacke_cheevx.c + lapacke_cheevx_work.c + lapacke_chegst.c + lapacke_chegst_work.c + lapacke_chegv.c + lapacke_chegv_work.c + lapacke_chegvd.c + lapacke_chegvd_work.c + lapacke_chegvx.c + lapacke_chegvx_work.c + lapacke_cherfs.c + lapacke_cherfs_work.c + lapacke_chesv.c + lapacke_chesv_work.c + lapacke_chesvx.c + lapacke_chesvx_work.c + lapacke_cheswapr.c + lapacke_cheswapr_work.c + lapacke_chetrd.c + lapacke_chetrd_work.c + lapacke_chetrf.c + lapacke_chetrf_work.c + lapacke_chetri.c + lapacke_chetri2.c + lapacke_chetri2_work.c + lapacke_chetri2x.c + lapacke_chetri2x_work.c + lapacke_chetri_work.c + lapacke_chetrs.c + lapacke_chetrs2.c + lapacke_chetrs2_work.c + lapacke_chetrs_work.c + lapacke_chfrk.c + lapacke_chfrk_work.c + lapacke_chgeqz.c + lapacke_chgeqz_work.c + lapacke_chpcon.c + lapacke_chpcon_work.c + lapacke_chpev.c + lapacke_chpev_work.c + lapacke_chpevd.c + lapacke_chpevd_work.c + lapacke_chpevx.c + lapacke_chpevx_work.c + lapacke_chpgst.c + lapacke_chpgst_work.c + lapacke_chpgv.c + lapacke_chpgv_work.c + lapacke_chpgvd.c + lapacke_chpgvd_work.c + lapacke_chpgvx.c + lapacke_chpgvx_work.c + lapacke_chprfs.c + lapacke_chprfs_work.c + lapacke_chpsv.c + lapacke_chpsv_work.c + lapacke_chpsvx.c + lapacke_chpsvx_work.c + lapacke_chptrd.c + lapacke_chptrd_work.c + lapacke_chptrf.c + lapacke_chptrf_work.c + lapacke_chptri.c + lapacke_chptri_work.c + lapacke_chptrs.c + lapacke_chptrs_work.c + lapacke_chsein.c + lapacke_chsein_work.c + lapacke_chseqr.c + lapacke_chseqr_work.c + lapacke_clacgv.c + lapacke_clacgv_work.c + lapacke_clacn2.c + lapacke_clacn2_work.c + lapacke_clacp2.c + lapacke_clacp2_work.c + lapacke_clacpy.c + lapacke_clacpy_work.c + lapacke_clag2z.c + lapacke_clag2z_work.c + lapacke_clange.c + lapacke_clange_work.c + lapacke_clanhe.c + lapacke_clanhe_work.c + lapacke_clansy.c + lapacke_clansy_work.c + lapacke_clantr.c + lapacke_clantr_work.c + lapacke_clapmr.c + lapacke_clapmr_work.c + 
lapacke_clarfb.c + lapacke_clarfb_work.c + lapacke_clarfg.c + lapacke_clarfg_work.c + lapacke_clarft.c + lapacke_clarft_work.c + lapacke_clarfx.c + lapacke_clarfx_work.c + lapacke_clarnv.c + lapacke_clarnv_work.c + lapacke_claset.c + lapacke_claset_work.c + lapacke_claswp.c + lapacke_claswp_work.c + lapacke_clauum.c + lapacke_clauum_work.c + lapacke_cpbcon.c + lapacke_cpbcon_work.c + lapacke_cpbequ.c + lapacke_cpbequ_work.c + lapacke_cpbrfs.c + lapacke_cpbrfs_work.c + lapacke_cpbstf.c + lapacke_cpbstf_work.c + lapacke_cpbsv.c + lapacke_cpbsv_work.c + lapacke_cpbsvx.c + lapacke_cpbsvx_work.c + lapacke_cpbtrf.c + lapacke_cpbtrf_work.c + lapacke_cpbtrs.c + lapacke_cpbtrs_work.c + lapacke_cpftrf.c + lapacke_cpftrf_work.c + lapacke_cpftri.c + lapacke_cpftri_work.c + lapacke_cpftrs.c + lapacke_cpftrs_work.c + lapacke_cpocon.c + lapacke_cpocon_work.c + lapacke_cpoequ.c + lapacke_cpoequ_work.c + lapacke_cpoequb.c + lapacke_cpoequb_work.c + lapacke_cporfs.c + lapacke_cporfs_work.c + lapacke_cposv.c + lapacke_cposv_work.c + lapacke_cposvx.c + lapacke_cposvx_work.c + lapacke_cpotrf.c + lapacke_cpotrf_work.c + lapacke_cpotri.c + lapacke_cpotri_work.c + lapacke_cpotrs.c + lapacke_cpotrs_work.c + lapacke_cppcon.c + lapacke_cppcon_work.c + lapacke_cppequ.c + lapacke_cppequ_work.c + lapacke_cpprfs.c + lapacke_cpprfs_work.c + lapacke_cppsv.c + lapacke_cppsv_work.c + lapacke_cppsvx.c + lapacke_cppsvx_work.c + lapacke_cpptrf.c + lapacke_cpptrf_work.c + lapacke_cpptri.c + lapacke_cpptri_work.c + lapacke_cpptrs.c + lapacke_cpptrs_work.c + lapacke_cpstrf.c + lapacke_cpstrf_work.c + lapacke_cptcon.c + lapacke_cptcon_work.c + lapacke_cpteqr.c + lapacke_cpteqr_work.c + lapacke_cptrfs.c + lapacke_cptrfs_work.c + lapacke_cptsv.c + lapacke_cptsv_work.c + lapacke_cptsvx.c + lapacke_cptsvx_work.c + lapacke_cpttrf.c + lapacke_cpttrf_work.c + lapacke_cpttrs.c + lapacke_cpttrs_work.c + lapacke_cspcon.c + lapacke_cspcon_work.c + lapacke_csprfs.c + lapacke_csprfs_work.c + lapacke_cspsv.c + lapacke_cspsv_work.c + lapacke_cspsvx.c + lapacke_cspsvx_work.c + lapacke_csptrf.c + lapacke_csptrf_work.c + lapacke_csptri.c + lapacke_csptri_work.c + lapacke_csptrs.c + lapacke_csptrs_work.c + lapacke_cstedc.c + lapacke_cstedc_work.c + lapacke_cstegr.c + lapacke_cstegr_work.c + lapacke_cstein.c + lapacke_cstein_work.c + lapacke_cstemr.c + lapacke_cstemr_work.c + lapacke_csteqr.c + lapacke_csteqr_work.c + lapacke_csycon.c + lapacke_csycon_work.c + lapacke_csyconv.c + lapacke_csyconv_work.c + lapacke_csyequb.c + lapacke_csyequb_work.c + lapacke_csyrfs.c + lapacke_csyrfs_work.c + lapacke_csysv.c + lapacke_csysv_rook.c + lapacke_csysv_rook_work.c + lapacke_csysv_work.c + lapacke_csysvx.c + lapacke_csysvx_work.c + lapacke_csyswapr.c + lapacke_csyswapr_work.c + lapacke_csytrf.c + lapacke_csytrf_work.c + lapacke_csytri.c + lapacke_csytri2.c + lapacke_csytri2_work.c + lapacke_csytri2x.c + lapacke_csytri2x_work.c + lapacke_csytri_work.c + lapacke_csytrs.c + lapacke_csytrs2.c + lapacke_csytrs2_work.c + lapacke_csytrs_work.c + lapacke_ctbcon.c + lapacke_ctbcon_work.c + lapacke_ctbrfs.c + lapacke_ctbrfs_work.c + lapacke_ctbtrs.c + lapacke_ctbtrs_work.c + lapacke_ctfsm.c + lapacke_ctfsm_work.c + lapacke_ctftri.c + lapacke_ctftri_work.c + lapacke_ctfttp.c + lapacke_ctfttp_work.c + lapacke_ctfttr.c + lapacke_ctfttr_work.c + lapacke_ctgevc.c + lapacke_ctgevc_work.c + lapacke_ctgexc.c + lapacke_ctgexc_work.c + lapacke_ctgsen.c + lapacke_ctgsen_work.c + lapacke_ctgsja.c + lapacke_ctgsja_work.c + lapacke_ctgsna.c + lapacke_ctgsna_work.c + lapacke_ctgsyl.c 
+ lapacke_ctgsyl_work.c + lapacke_ctpcon.c + lapacke_ctpcon_work.c + lapacke_ctpmqrt.c + lapacke_ctpmqrt_work.c + lapacke_ctpqrt.c + lapacke_ctpqrt2.c + lapacke_ctpqrt2_work.c + lapacke_ctpqrt_work.c + lapacke_ctprfb.c + lapacke_ctprfb_work.c + lapacke_ctprfs.c + lapacke_ctprfs_work.c + lapacke_ctptri.c + lapacke_ctptri_work.c + lapacke_ctptrs.c + lapacke_ctptrs_work.c + lapacke_ctpttf.c + lapacke_ctpttf_work.c + lapacke_ctpttr.c + lapacke_ctpttr_work.c + lapacke_ctrcon.c + lapacke_ctrcon_work.c + lapacke_ctrevc.c + lapacke_ctrevc_work.c + lapacke_ctrexc.c + lapacke_ctrexc_work.c + lapacke_ctrrfs.c + lapacke_ctrrfs_work.c + lapacke_ctrsen.c + lapacke_ctrsen_work.c + lapacke_ctrsna.c + lapacke_ctrsna_work.c + lapacke_ctrsyl.c + lapacke_ctrsyl_work.c + lapacke_ctrtri.c + lapacke_ctrtri_work.c + lapacke_ctrtrs.c + lapacke_ctrtrs_work.c + lapacke_ctrttf.c + lapacke_ctrttf_work.c + lapacke_ctrttp.c + lapacke_ctrttp_work.c + lapacke_ctzrzf.c + lapacke_ctzrzf_work.c + lapacke_cunbdb.c + lapacke_cunbdb_work.c + lapacke_cuncsd.c + lapacke_cuncsd_work.c + lapacke_cungbr.c + lapacke_cungbr_work.c + lapacke_cunghr.c + lapacke_cunghr_work.c + lapacke_cunglq.c + lapacke_cunglq_work.c + lapacke_cungql.c + lapacke_cungql_work.c + lapacke_cungqr.c + lapacke_cungqr_work.c + lapacke_cungrq.c + lapacke_cungrq_work.c + lapacke_cungtr.c + lapacke_cungtr_work.c + lapacke_cunmbr.c + lapacke_cunmbr_work.c + lapacke_cunmhr.c + lapacke_cunmhr_work.c + lapacke_cunmlq.c + lapacke_cunmlq_work.c + lapacke_cunmql.c + lapacke_cunmql_work.c + lapacke_cunmqr.c + lapacke_cunmqr_work.c + lapacke_cunmrq.c + lapacke_cunmrq_work.c + lapacke_cunmrz.c + lapacke_cunmrz_work.c + lapacke_cunmtr.c + lapacke_cunmtr_work.c + lapacke_cupgtr.c + lapacke_cupgtr_work.c + lapacke_cupmtr.c + lapacke_cupmtr_work.c +) + +set(DSRC + lapacke_dbbcsd.c + lapacke_dbbcsd_work.c + lapacke_dbdsdc.c + lapacke_dbdsdc_work.c + lapacke_dbdsqr.c + lapacke_dbdsqr_work.c + lapacke_ddisna.c + lapacke_ddisna_work.c + lapacke_dgbbrd.c + lapacke_dgbbrd_work.c + lapacke_dgbcon.c + lapacke_dgbcon_work.c + lapacke_dgbequ.c + lapacke_dgbequ_work.c + lapacke_dgbequb.c + lapacke_dgbequb_work.c + lapacke_dgbrfs.c + lapacke_dgbrfs_work.c + lapacke_dgbsv.c + lapacke_dgbsv_work.c + lapacke_dgbsvx.c + lapacke_dgbsvx_work.c + lapacke_dgbtrf.c + lapacke_dgbtrf_work.c + lapacke_dgbtrs.c + lapacke_dgbtrs_work.c + lapacke_dgebak.c + lapacke_dgebak_work.c + lapacke_dgebal.c + lapacke_dgebal_work.c + lapacke_dgebrd.c + lapacke_dgebrd_work.c + lapacke_dgecon.c + lapacke_dgecon_work.c + lapacke_dgeequ.c + lapacke_dgeequ_work.c + lapacke_dgeequb.c + lapacke_dgeequb_work.c + lapacke_dgees.c + lapacke_dgees_work.c + lapacke_dgeesx.c + lapacke_dgeesx_work.c + lapacke_dgeev.c + lapacke_dgeev_work.c + lapacke_dgeevx.c + lapacke_dgeevx_work.c + lapacke_dgehrd.c + lapacke_dgehrd_work.c + lapacke_dgejsv.c + lapacke_dgejsv_work.c + lapacke_dgelq2.c + lapacke_dgelq2_work.c + lapacke_dgelqf.c + lapacke_dgelqf_work.c + lapacke_dgels.c + lapacke_dgels_work.c + lapacke_dgelsd.c + lapacke_dgelsd_work.c + lapacke_dgelss.c + lapacke_dgelss_work.c + lapacke_dgelsy.c + lapacke_dgelsy_work.c + lapacke_dgemqrt.c + lapacke_dgemqrt_work.c + lapacke_dgeqlf.c + lapacke_dgeqlf_work.c + lapacke_dgeqp3.c + lapacke_dgeqp3_work.c + lapacke_dgeqpf.c + lapacke_dgeqpf_work.c + lapacke_dgeqr2.c + lapacke_dgeqr2_work.c + lapacke_dgeqrf.c + lapacke_dgeqrf_work.c + lapacke_dgeqrfp.c + lapacke_dgeqrfp_work.c + lapacke_dgeqrt.c + lapacke_dgeqrt2.c + lapacke_dgeqrt2_work.c + lapacke_dgeqrt3.c + lapacke_dgeqrt3_work.c + 
lapacke_dgeqrt_work.c + lapacke_dgerfs.c + lapacke_dgerfs_work.c + lapacke_dgerqf.c + lapacke_dgerqf_work.c + lapacke_dgesdd.c + lapacke_dgesdd_work.c + lapacke_dgesv.c + lapacke_dgesv_work.c + lapacke_dgesvd.c + lapacke_dgesvd_work.c + lapacke_dgesvj.c + lapacke_dgesvj_work.c + lapacke_dgesvx.c + lapacke_dgesvx_work.c + lapacke_dgetf2.c + lapacke_dgetf2_work.c + lapacke_dgetrf.c + lapacke_dgetrf_work.c + lapacke_dgetri.c + lapacke_dgetri_work.c + lapacke_dgetrs.c + lapacke_dgetrs_work.c + lapacke_dggbak.c + lapacke_dggbak_work.c + lapacke_dggbal.c + lapacke_dggbal_work.c + lapacke_dgges.c + lapacke_dgges_work.c + lapacke_dggesx.c + lapacke_dggesx_work.c + lapacke_dggev.c + lapacke_dggev_work.c + lapacke_dggevx.c + lapacke_dggevx_work.c + lapacke_dggglm.c + lapacke_dggglm_work.c + lapacke_dgghrd.c + lapacke_dgghrd_work.c + lapacke_dgglse.c + lapacke_dgglse_work.c + lapacke_dggqrf.c + lapacke_dggqrf_work.c + lapacke_dggrqf.c + lapacke_dggrqf_work.c + lapacke_dggsvd.c + lapacke_dggsvd_work.c + lapacke_dggsvp.c + lapacke_dggsvp_work.c + lapacke_dgtcon.c + lapacke_dgtcon_work.c + lapacke_dgtrfs.c + lapacke_dgtrfs_work.c + lapacke_dgtsv.c + lapacke_dgtsv_work.c + lapacke_dgtsvx.c + lapacke_dgtsvx_work.c + lapacke_dgttrf.c + lapacke_dgttrf_work.c + lapacke_dgttrs.c + lapacke_dgttrs_work.c + lapacke_dhgeqz.c + lapacke_dhgeqz_work.c + lapacke_dhsein.c + lapacke_dhsein_work.c + lapacke_dhseqr.c + lapacke_dhseqr_work.c + lapacke_dlacn2.c + lapacke_dlacn2_work.c + lapacke_dlacpy.c + lapacke_dlacpy_work.c + lapacke_dlag2s.c + lapacke_dlag2s_work.c + lapacke_dlamch.c + lapacke_dlamch_work.c + lapacke_dlange.c + lapacke_dlange_work.c + lapacke_dlansy.c + lapacke_dlansy_work.c + lapacke_dlantr.c + lapacke_dlantr_work.c + lapacke_dlapmr.c + lapacke_dlapmr_work.c + lapacke_dlapy2.c + lapacke_dlapy2_work.c + lapacke_dlapy3.c + lapacke_dlapy3_work.c + lapacke_dlarfb.c + lapacke_dlarfb_work.c + lapacke_dlarfg.c + lapacke_dlarfg_work.c + lapacke_dlarft.c + lapacke_dlarft_work.c + lapacke_dlarfx.c + lapacke_dlarfx_work.c + lapacke_dlarnv.c + lapacke_dlarnv_work.c + lapacke_dlartgp.c + lapacke_dlartgp_work.c + lapacke_dlartgs.c + lapacke_dlartgs_work.c + lapacke_dlaset.c + lapacke_dlaset_work.c + lapacke_dlasrt.c + lapacke_dlasrt_work.c + lapacke_dlaswp.c + lapacke_dlaswp_work.c + lapacke_dlauum.c + lapacke_dlauum_work.c + lapacke_dopgtr.c + lapacke_dopgtr_work.c + lapacke_dopmtr.c + lapacke_dopmtr_work.c + lapacke_dorbdb.c + lapacke_dorbdb_work.c + lapacke_dorcsd.c + lapacke_dorcsd_work.c + lapacke_dorgbr.c + lapacke_dorgbr_work.c + lapacke_dorghr.c + lapacke_dorghr_work.c + lapacke_dorglq.c + lapacke_dorglq_work.c + lapacke_dorgql.c + lapacke_dorgql_work.c + lapacke_dorgqr.c + lapacke_dorgqr_work.c + lapacke_dorgrq.c + lapacke_dorgrq_work.c + lapacke_dorgtr.c + lapacke_dorgtr_work.c + lapacke_dormbr.c + lapacke_dormbr_work.c + lapacke_dormhr.c + lapacke_dormhr_work.c + lapacke_dormlq.c + lapacke_dormlq_work.c + lapacke_dormql.c + lapacke_dormql_work.c + lapacke_dormqr.c + lapacke_dormqr_work.c + lapacke_dormrq.c + lapacke_dormrq_work.c + lapacke_dormrz.c + lapacke_dormrz_work.c + lapacke_dormtr.c + lapacke_dormtr_work.c + lapacke_dpbcon.c + lapacke_dpbcon_work.c + lapacke_dpbequ.c + lapacke_dpbequ_work.c + lapacke_dpbrfs.c + lapacke_dpbrfs_work.c + lapacke_dpbstf.c + lapacke_dpbstf_work.c + lapacke_dpbsv.c + lapacke_dpbsv_work.c + lapacke_dpbsvx.c + lapacke_dpbsvx_work.c + lapacke_dpbtrf.c + lapacke_dpbtrf_work.c + lapacke_dpbtrs.c + lapacke_dpbtrs_work.c + lapacke_dpftrf.c + lapacke_dpftrf_work.c + 
lapacke_dpftri.c + lapacke_dpftri_work.c + lapacke_dpftrs.c + lapacke_dpftrs_work.c + lapacke_dpocon.c + lapacke_dpocon_work.c + lapacke_dpoequ.c + lapacke_dpoequ_work.c + lapacke_dpoequb.c + lapacke_dpoequb_work.c + lapacke_dporfs.c + lapacke_dporfs_work.c + lapacke_dposv.c + lapacke_dposv_work.c + lapacke_dposvx.c + lapacke_dposvx_work.c + lapacke_dpotrf.c + lapacke_dpotrf_work.c + lapacke_dpotri.c + lapacke_dpotri_work.c + lapacke_dpotrs.c + lapacke_dpotrs_work.c + lapacke_dppcon.c + lapacke_dppcon_work.c + lapacke_dppequ.c + lapacke_dppequ_work.c + lapacke_dpprfs.c + lapacke_dpprfs_work.c + lapacke_dppsv.c + lapacke_dppsv_work.c + lapacke_dppsvx.c + lapacke_dppsvx_work.c + lapacke_dpptrf.c + lapacke_dpptrf_work.c + lapacke_dpptri.c + lapacke_dpptri_work.c + lapacke_dpptrs.c + lapacke_dpptrs_work.c + lapacke_dpstrf.c + lapacke_dpstrf_work.c + lapacke_dptcon.c + lapacke_dptcon_work.c + lapacke_dpteqr.c + lapacke_dpteqr_work.c + lapacke_dptrfs.c + lapacke_dptrfs_work.c + lapacke_dptsv.c + lapacke_dptsv_work.c + lapacke_dptsvx.c + lapacke_dptsvx_work.c + lapacke_dpttrf.c + lapacke_dpttrf_work.c + lapacke_dpttrs.c + lapacke_dpttrs_work.c + lapacke_dsbev.c + lapacke_dsbev_work.c + lapacke_dsbevd.c + lapacke_dsbevd_work.c + lapacke_dsbevx.c + lapacke_dsbevx_work.c + lapacke_dsbgst.c + lapacke_dsbgst_work.c + lapacke_dsbgv.c + lapacke_dsbgv_work.c + lapacke_dsbgvd.c + lapacke_dsbgvd_work.c + lapacke_dsbgvx.c + lapacke_dsbgvx_work.c + lapacke_dsbtrd.c + lapacke_dsbtrd_work.c + lapacke_dsfrk.c + lapacke_dsfrk_work.c + lapacke_dsgesv.c + lapacke_dsgesv_work.c + lapacke_dspcon.c + lapacke_dspcon_work.c + lapacke_dspev.c + lapacke_dspev_work.c + lapacke_dspevd.c + lapacke_dspevd_work.c + lapacke_dspevx.c + lapacke_dspevx_work.c + lapacke_dspgst.c + lapacke_dspgst_work.c + lapacke_dspgv.c + lapacke_dspgv_work.c + lapacke_dspgvd.c + lapacke_dspgvd_work.c + lapacke_dspgvx.c + lapacke_dspgvx_work.c + lapacke_dsposv.c + lapacke_dsposv_work.c + lapacke_dsprfs.c + lapacke_dsprfs_work.c + lapacke_dspsv.c + lapacke_dspsv_work.c + lapacke_dspsvx.c + lapacke_dspsvx_work.c + lapacke_dsptrd.c + lapacke_dsptrd_work.c + lapacke_dsptrf.c + lapacke_dsptrf_work.c + lapacke_dsptri.c + lapacke_dsptri_work.c + lapacke_dsptrs.c + lapacke_dsptrs_work.c + lapacke_dstebz.c + lapacke_dstebz_work.c + lapacke_dstedc.c + lapacke_dstedc_work.c + lapacke_dstegr.c + lapacke_dstegr_work.c + lapacke_dstein.c + lapacke_dstein_work.c + lapacke_dstemr.c + lapacke_dstemr_work.c + lapacke_dsteqr.c + lapacke_dsteqr_work.c + lapacke_dsterf.c + lapacke_dsterf_work.c + lapacke_dstev.c + lapacke_dstev_work.c + lapacke_dstevd.c + lapacke_dstevd_work.c + lapacke_dstevr.c + lapacke_dstevr_work.c + lapacke_dstevx.c + lapacke_dstevx_work.c + lapacke_dsycon.c + lapacke_dsycon_work.c + lapacke_dsyconv.c + lapacke_dsyconv_work.c + lapacke_dsyequb.c + lapacke_dsyequb_work.c + lapacke_dsyev.c + lapacke_dsyev_work.c + lapacke_dsyevd.c + lapacke_dsyevd_work.c + lapacke_dsyevr.c + lapacke_dsyevr_work.c + lapacke_dsyevx.c + lapacke_dsyevx_work.c + lapacke_dsygst.c + lapacke_dsygst_work.c + lapacke_dsygv.c + lapacke_dsygv_work.c + lapacke_dsygvd.c + lapacke_dsygvd_work.c + lapacke_dsygvx.c + lapacke_dsygvx_work.c + lapacke_dsyrfs.c + lapacke_dsyrfs_work.c + lapacke_dsysv.c + lapacke_dsysv_rook.c + lapacke_dsysv_rook_work.c + lapacke_dsysv_work.c + lapacke_dsysvx.c + lapacke_dsysvx_work.c + lapacke_dsyswapr.c + lapacke_dsyswapr_work.c + lapacke_dsytrd.c + lapacke_dsytrd_work.c + lapacke_dsytrf.c + lapacke_dsytrf_work.c + lapacke_dsytri.c + lapacke_dsytri2.c 
+ lapacke_dsytri2_work.c + lapacke_dsytri2x.c + lapacke_dsytri2x_work.c + lapacke_dsytri_work.c + lapacke_dsytrs.c + lapacke_dsytrs2.c + lapacke_dsytrs2_work.c + lapacke_dsytrs_work.c + lapacke_dtbcon.c + lapacke_dtbcon_work.c + lapacke_dtbrfs.c + lapacke_dtbrfs_work.c + lapacke_dtbtrs.c + lapacke_dtbtrs_work.c + lapacke_dtfsm.c + lapacke_dtfsm_work.c + lapacke_dtftri.c + lapacke_dtftri_work.c + lapacke_dtfttp.c + lapacke_dtfttp_work.c + lapacke_dtfttr.c + lapacke_dtfttr_work.c + lapacke_dtgevc.c + lapacke_dtgevc_work.c + lapacke_dtgexc.c + lapacke_dtgexc_work.c + lapacke_dtgsen.c + lapacke_dtgsen_work.c + lapacke_dtgsja.c + lapacke_dtgsja_work.c + lapacke_dtgsna.c + lapacke_dtgsna_work.c + lapacke_dtgsyl.c + lapacke_dtgsyl_work.c + lapacke_dtpcon.c + lapacke_dtpcon_work.c + lapacke_dtpmqrt.c + lapacke_dtpmqrt_work.c + lapacke_dtpqrt.c + lapacke_dtpqrt2.c + lapacke_dtpqrt2_work.c + lapacke_dtpqrt_work.c + lapacke_dtprfb.c + lapacke_dtprfb_work.c + lapacke_dtprfs.c + lapacke_dtprfs_work.c + lapacke_dtptri.c + lapacke_dtptri_work.c + lapacke_dtptrs.c + lapacke_dtptrs_work.c + lapacke_dtpttf.c + lapacke_dtpttf_work.c + lapacke_dtpttr.c + lapacke_dtpttr_work.c + lapacke_dtrcon.c + lapacke_dtrcon_work.c + lapacke_dtrevc.c + lapacke_dtrevc_work.c + lapacke_dtrexc.c + lapacke_dtrexc_work.c + lapacke_dtrrfs.c + lapacke_dtrrfs_work.c + lapacke_dtrsen.c + lapacke_dtrsen_work.c + lapacke_dtrsna.c + lapacke_dtrsna_work.c + lapacke_dtrsyl.c + lapacke_dtrsyl_work.c + lapacke_dtrtri.c + lapacke_dtrtri_work.c + lapacke_dtrtrs.c + lapacke_dtrtrs_work.c + lapacke_dtrttf.c + lapacke_dtrttf_work.c + lapacke_dtrttp.c + lapacke_dtrttp_work.c + lapacke_dtzrzf.c + lapacke_dtzrzf_work.c +) + +set(SSRC + lapacke_sbbcsd.c + lapacke_sbbcsd_work.c + lapacke_sbdsdc.c + lapacke_sbdsdc_work.c + lapacke_sbdsqr.c + lapacke_sbdsqr_work.c + lapacke_sdisna.c + lapacke_sdisna_work.c + lapacke_sgbbrd.c + lapacke_sgbbrd_work.c + lapacke_sgbcon.c + lapacke_sgbcon_work.c + lapacke_sgbequ.c + lapacke_sgbequ_work.c + lapacke_sgbequb.c + lapacke_sgbequb_work.c + lapacke_sgbrfs.c + lapacke_sgbrfs_work.c + lapacke_sgbsv.c + lapacke_sgbsv_work.c + lapacke_sgbsvx.c + lapacke_sgbsvx_work.c + lapacke_sgbtrf.c + lapacke_sgbtrf_work.c + lapacke_sgbtrs.c + lapacke_sgbtrs_work.c + lapacke_sgebak.c + lapacke_sgebak_work.c + lapacke_sgebal.c + lapacke_sgebal_work.c + lapacke_sgebrd.c + lapacke_sgebrd_work.c + lapacke_sgecon.c + lapacke_sgecon_work.c + lapacke_sgeequ.c + lapacke_sgeequ_work.c + lapacke_sgeequb.c + lapacke_sgeequb_work.c + lapacke_sgees.c + lapacke_sgees_work.c + lapacke_sgeesx.c + lapacke_sgeesx_work.c + lapacke_sgeev.c + lapacke_sgeev_work.c + lapacke_sgeevx.c + lapacke_sgeevx_work.c + lapacke_sgehrd.c + lapacke_sgehrd_work.c + lapacke_sgejsv.c + lapacke_sgejsv_work.c + lapacke_sgelq2.c + lapacke_sgelq2_work.c + lapacke_sgelqf.c + lapacke_sgelqf_work.c + lapacke_sgels.c + lapacke_sgels_work.c + lapacke_sgelsd.c + lapacke_sgelsd_work.c + lapacke_sgelss.c + lapacke_sgelss_work.c + lapacke_sgelsy.c + lapacke_sgelsy_work.c + lapacke_sgemqrt.c + lapacke_sgemqrt_work.c + lapacke_sgeqlf.c + lapacke_sgeqlf_work.c + lapacke_sgeqp3.c + lapacke_sgeqp3_work.c + lapacke_sgeqpf.c + lapacke_sgeqpf_work.c + lapacke_sgeqr2.c + lapacke_sgeqr2_work.c + lapacke_sgeqrf.c + lapacke_sgeqrf_work.c + lapacke_sgeqrfp.c + lapacke_sgeqrfp_work.c + lapacke_sgeqrt.c + lapacke_sgeqrt2.c + lapacke_sgeqrt2_work.c + lapacke_sgeqrt3.c + lapacke_sgeqrt3_work.c + lapacke_sgeqrt_work.c + lapacke_sgerfs.c + lapacke_sgerfs_work.c + lapacke_sgerqf.c + 
lapacke_sgerqf_work.c + lapacke_sgesdd.c + lapacke_sgesdd_work.c + lapacke_sgesv.c + lapacke_sgesv_work.c + lapacke_sgesvd.c + lapacke_sgesvd_work.c + lapacke_sgesvj.c + lapacke_sgesvj_work.c + lapacke_sgesvx.c + lapacke_sgesvx_work.c + lapacke_sgetf2.c + lapacke_sgetf2_work.c + lapacke_sgetrf.c + lapacke_sgetrf_work.c + lapacke_sgetri.c + lapacke_sgetri_work.c + lapacke_sgetrs.c + lapacke_sgetrs_work.c + lapacke_sggbak.c + lapacke_sggbak_work.c + lapacke_sggbal.c + lapacke_sggbal_work.c + lapacke_sgges.c + lapacke_sgges_work.c + lapacke_sggesx.c + lapacke_sggesx_work.c + lapacke_sggev.c + lapacke_sggev_work.c + lapacke_sggevx.c + lapacke_sggevx_work.c + lapacke_sggglm.c + lapacke_sggglm_work.c + lapacke_sgghrd.c + lapacke_sgghrd_work.c + lapacke_sgglse.c + lapacke_sgglse_work.c + lapacke_sggqrf.c + lapacke_sggqrf_work.c + lapacke_sggrqf.c + lapacke_sggrqf_work.c + lapacke_sggsvd.c + lapacke_sggsvd_work.c + lapacke_sggsvp.c + lapacke_sggsvp_work.c + lapacke_sgtcon.c + lapacke_sgtcon_work.c + lapacke_sgtrfs.c + lapacke_sgtrfs_work.c + lapacke_sgtsv.c + lapacke_sgtsv_work.c + lapacke_sgtsvx.c + lapacke_sgtsvx_work.c + lapacke_sgttrf.c + lapacke_sgttrf_work.c + lapacke_sgttrs.c + lapacke_sgttrs_work.c + lapacke_shgeqz.c + lapacke_shgeqz_work.c + lapacke_shsein.c + lapacke_shsein_work.c + lapacke_shseqr.c + lapacke_shseqr_work.c + lapacke_slacn2.c + lapacke_slacn2_work.c + lapacke_slacpy.c + lapacke_slacpy_work.c + lapacke_slag2d.c + lapacke_slag2d_work.c + lapacke_slamch.c + lapacke_slamch_work.c + lapacke_slange.c + lapacke_slange_work.c + lapacke_slansy.c + lapacke_slansy_work.c + lapacke_slantr.c + lapacke_slantr_work.c + lapacke_slapmr.c + lapacke_slapmr_work.c + lapacke_slapy2.c + lapacke_slapy2_work.c + lapacke_slapy3.c + lapacke_slapy3_work.c + lapacke_slarfb.c + lapacke_slarfb_work.c + lapacke_slarfg.c + lapacke_slarfg_work.c + lapacke_slarft.c + lapacke_slarft_work.c + lapacke_slarfx.c + lapacke_slarfx_work.c + lapacke_slarnv.c + lapacke_slarnv_work.c + lapacke_slartgp.c + lapacke_slartgp_work.c + lapacke_slartgs.c + lapacke_slartgs_work.c + lapacke_slaset.c + lapacke_slaset_work.c + lapacke_slasrt.c + lapacke_slasrt_work.c + lapacke_slaswp.c + lapacke_slaswp_work.c + lapacke_slauum.c + lapacke_slauum_work.c + lapacke_sopgtr.c + lapacke_sopgtr_work.c + lapacke_sopmtr.c + lapacke_sopmtr_work.c + lapacke_sorbdb.c + lapacke_sorbdb_work.c + lapacke_sorcsd.c + lapacke_sorcsd_work.c + lapacke_sorgbr.c + lapacke_sorgbr_work.c + lapacke_sorghr.c + lapacke_sorghr_work.c + lapacke_sorglq.c + lapacke_sorglq_work.c + lapacke_sorgql.c + lapacke_sorgql_work.c + lapacke_sorgqr.c + lapacke_sorgqr_work.c + lapacke_sorgrq.c + lapacke_sorgrq_work.c + lapacke_sorgtr.c + lapacke_sorgtr_work.c + lapacke_sormbr.c + lapacke_sormbr_work.c + lapacke_sormhr.c + lapacke_sormhr_work.c + lapacke_sormlq.c + lapacke_sormlq_work.c + lapacke_sormql.c + lapacke_sormql_work.c + lapacke_sormqr.c + lapacke_sormqr_work.c + lapacke_sormrq.c + lapacke_sormrq_work.c + lapacke_sormrz.c + lapacke_sormrz_work.c + lapacke_sormtr.c + lapacke_sormtr_work.c + lapacke_spbcon.c + lapacke_spbcon_work.c + lapacke_spbequ.c + lapacke_spbequ_work.c + lapacke_spbrfs.c + lapacke_spbrfs_work.c + lapacke_spbstf.c + lapacke_spbstf_work.c + lapacke_spbsv.c + lapacke_spbsv_work.c + lapacke_spbsvx.c + lapacke_spbsvx_work.c + lapacke_spbtrf.c + lapacke_spbtrf_work.c + lapacke_spbtrs.c + lapacke_spbtrs_work.c + lapacke_spftrf.c + lapacke_spftrf_work.c + lapacke_spftri.c + lapacke_spftri_work.c + lapacke_spftrs.c + lapacke_spftrs_work.c + 
lapacke_spocon.c + lapacke_spocon_work.c + lapacke_spoequ.c + lapacke_spoequ_work.c + lapacke_spoequb.c + lapacke_spoequb_work.c + lapacke_sporfs.c + lapacke_sporfs_work.c + lapacke_sposv.c + lapacke_sposv_work.c + lapacke_sposvx.c + lapacke_sposvx_work.c + lapacke_spotrf.c + lapacke_spotrf_work.c + lapacke_spotri.c + lapacke_spotri_work.c + lapacke_spotrs.c + lapacke_spotrs_work.c + lapacke_sppcon.c + lapacke_sppcon_work.c + lapacke_sppequ.c + lapacke_sppequ_work.c + lapacke_spprfs.c + lapacke_spprfs_work.c + lapacke_sppsv.c + lapacke_sppsv_work.c + lapacke_sppsvx.c + lapacke_sppsvx_work.c + lapacke_spptrf.c + lapacke_spptrf_work.c + lapacke_spptri.c + lapacke_spptri_work.c + lapacke_spptrs.c + lapacke_spptrs_work.c + lapacke_spstrf.c + lapacke_spstrf_work.c + lapacke_sptcon.c + lapacke_sptcon_work.c + lapacke_spteqr.c + lapacke_spteqr_work.c + lapacke_sptrfs.c + lapacke_sptrfs_work.c + lapacke_sptsv.c + lapacke_sptsv_work.c + lapacke_sptsvx.c + lapacke_sptsvx_work.c + lapacke_spttrf.c + lapacke_spttrf_work.c + lapacke_spttrs.c + lapacke_spttrs_work.c + lapacke_ssbev.c + lapacke_ssbev_work.c + lapacke_ssbevd.c + lapacke_ssbevd_work.c + lapacke_ssbevx.c + lapacke_ssbevx_work.c + lapacke_ssbgst.c + lapacke_ssbgst_work.c + lapacke_ssbgv.c + lapacke_ssbgv_work.c + lapacke_ssbgvd.c + lapacke_ssbgvd_work.c + lapacke_ssbgvx.c + lapacke_ssbgvx_work.c + lapacke_ssbtrd.c + lapacke_ssbtrd_work.c + lapacke_ssfrk.c + lapacke_ssfrk_work.c + lapacke_sspcon.c + lapacke_sspcon_work.c + lapacke_sspev.c + lapacke_sspev_work.c + lapacke_sspevd.c + lapacke_sspevd_work.c + lapacke_sspevx.c + lapacke_sspevx_work.c + lapacke_sspgst.c + lapacke_sspgst_work.c + lapacke_sspgv.c + lapacke_sspgv_work.c + lapacke_sspgvd.c + lapacke_sspgvd_work.c + lapacke_sspgvx.c + lapacke_sspgvx_work.c + lapacke_ssprfs.c + lapacke_ssprfs_work.c + lapacke_sspsv.c + lapacke_sspsv_work.c + lapacke_sspsvx.c + lapacke_sspsvx_work.c + lapacke_ssptrd.c + lapacke_ssptrd_work.c + lapacke_ssptrf.c + lapacke_ssptrf_work.c + lapacke_ssptri.c + lapacke_ssptri_work.c + lapacke_ssptrs.c + lapacke_ssptrs_work.c + lapacke_sstebz.c + lapacke_sstebz_work.c + lapacke_sstedc.c + lapacke_sstedc_work.c + lapacke_sstegr.c + lapacke_sstegr_work.c + lapacke_sstein.c + lapacke_sstein_work.c + lapacke_sstemr.c + lapacke_sstemr_work.c + lapacke_ssteqr.c + lapacke_ssteqr_work.c + lapacke_ssterf.c + lapacke_ssterf_work.c + lapacke_sstev.c + lapacke_sstev_work.c + lapacke_sstevd.c + lapacke_sstevd_work.c + lapacke_sstevr.c + lapacke_sstevr_work.c + lapacke_sstevx.c + lapacke_sstevx_work.c + lapacke_ssycon.c + lapacke_ssycon_work.c + lapacke_ssyconv.c + lapacke_ssyconv_work.c + lapacke_ssyequb.c + lapacke_ssyequb_work.c + lapacke_ssyev.c + lapacke_ssyev_work.c + lapacke_ssyevd.c + lapacke_ssyevd_work.c + lapacke_ssyevr.c + lapacke_ssyevr_work.c + lapacke_ssyevx.c + lapacke_ssyevx_work.c + lapacke_ssygst.c + lapacke_ssygst_work.c + lapacke_ssygv.c + lapacke_ssygv_work.c + lapacke_ssygvd.c + lapacke_ssygvd_work.c + lapacke_ssygvx.c + lapacke_ssygvx_work.c + lapacke_ssyrfs.c + lapacke_ssyrfs_work.c + lapacke_ssysv.c + lapacke_ssysv_rook.c + lapacke_ssysv_rook_work.c + lapacke_ssysv_work.c + lapacke_ssysvx.c + lapacke_ssysvx_work.c + lapacke_ssyswapr.c + lapacke_ssyswapr_work.c + lapacke_ssytrd.c + lapacke_ssytrd_work.c + lapacke_ssytrf.c + lapacke_ssytrf_work.c + lapacke_ssytri.c + lapacke_ssytri2.c + lapacke_ssytri2_work.c + lapacke_ssytri2x.c + lapacke_ssytri2x_work.c + lapacke_ssytri_work.c + lapacke_ssytrs.c + lapacke_ssytrs2.c + lapacke_ssytrs2_work.c + 
lapacke_ssytrs_work.c + lapacke_stbcon.c + lapacke_stbcon_work.c + lapacke_stbrfs.c + lapacke_stbrfs_work.c + lapacke_stbtrs.c + lapacke_stbtrs_work.c + lapacke_stfsm.c + lapacke_stfsm_work.c + lapacke_stftri.c + lapacke_stftri_work.c + lapacke_stfttp.c + lapacke_stfttp_work.c + lapacke_stfttr.c + lapacke_stfttr_work.c + lapacke_stgevc.c + lapacke_stgevc_work.c + lapacke_stgexc.c + lapacke_stgexc_work.c + lapacke_stgsen.c + lapacke_stgsen_work.c + lapacke_stgsja.c + lapacke_stgsja_work.c + lapacke_stgsna.c + lapacke_stgsna_work.c + lapacke_stgsyl.c + lapacke_stgsyl_work.c + lapacke_stpcon.c + lapacke_stpcon_work.c + lapacke_stpmqrt.c + lapacke_stpmqrt_work.c + lapacke_stpqrt2.c + lapacke_stpqrt2_work.c + lapacke_stprfb.c + lapacke_stprfb_work.c + lapacke_stprfs.c + lapacke_stprfs_work.c + lapacke_stptri.c + lapacke_stptri_work.c + lapacke_stptrs.c + lapacke_stptrs_work.c + lapacke_stpttf.c + lapacke_stpttf_work.c + lapacke_stpttr.c + lapacke_stpttr_work.c + lapacke_strcon.c + lapacke_strcon_work.c + lapacke_strevc.c + lapacke_strevc_work.c + lapacke_strexc.c + lapacke_strexc_work.c + lapacke_strrfs.c + lapacke_strrfs_work.c + lapacke_strsen.c + lapacke_strsen_work.c + lapacke_strsna.c + lapacke_strsna_work.c + lapacke_strsyl.c + lapacke_strsyl_work.c + lapacke_strtri.c + lapacke_strtri_work.c + lapacke_strtrs.c + lapacke_strtrs_work.c + lapacke_strttf.c + lapacke_strttf_work.c + lapacke_strttp.c + lapacke_strttp_work.c + lapacke_stzrzf.c + lapacke_stzrzf_work.c +) + +set(ZSRC + lapacke_zbbcsd.c + lapacke_zbbcsd_work.c + lapacke_zbdsqr.c + lapacke_zbdsqr_work.c + lapacke_zcgesv.c + lapacke_zcgesv_work.c + lapacke_zcposv.c + lapacke_zcposv_work.c + lapacke_zgbbrd.c + lapacke_zgbbrd_work.c + lapacke_zgbcon.c + lapacke_zgbcon_work.c + lapacke_zgbequ.c + lapacke_zgbequ_work.c + lapacke_zgbequb.c + lapacke_zgbequb_work.c + lapacke_zgbrfs.c + lapacke_zgbrfs_work.c + lapacke_zgbsv.c + lapacke_zgbsv_work.c + lapacke_zgbsvx.c + lapacke_zgbsvx_work.c + lapacke_zgbtrf.c + lapacke_zgbtrf_work.c + lapacke_zgbtrs.c + lapacke_zgbtrs_work.c + lapacke_zgebak.c + lapacke_zgebak_work.c + lapacke_zgebal.c + lapacke_zgebal_work.c + lapacke_zgebrd.c + lapacke_zgebrd_work.c + lapacke_zgecon.c + lapacke_zgecon_work.c + lapacke_zgeequ.c + lapacke_zgeequ_work.c + lapacke_zgeequb.c + lapacke_zgeequb_work.c + lapacke_zgees.c + lapacke_zgees_work.c + lapacke_zgeesx.c + lapacke_zgeesx_work.c + lapacke_zgeev.c + lapacke_zgeev_work.c + lapacke_zgeevx.c + lapacke_zgeevx_work.c + lapacke_zgehrd.c + lapacke_zgehrd_work.c + lapacke_zgelq2.c + lapacke_zgelq2_work.c + lapacke_zgelqf.c + lapacke_zgelqf_work.c + lapacke_zgels.c + lapacke_zgels_work.c + lapacke_zgelsd.c + lapacke_zgelsd_work.c + lapacke_zgelss.c + lapacke_zgelss_work.c + lapacke_zgelsy.c + lapacke_zgelsy_work.c + lapacke_zgemqrt.c + lapacke_zgemqrt_work.c + lapacke_zgeqlf.c + lapacke_zgeqlf_work.c + lapacke_zgeqp3.c + lapacke_zgeqp3_work.c + lapacke_zgeqpf.c + lapacke_zgeqpf_work.c + lapacke_zgeqr2.c + lapacke_zgeqr2_work.c + lapacke_zgeqrf.c + lapacke_zgeqrf_work.c + lapacke_zgeqrfp.c + lapacke_zgeqrfp_work.c + lapacke_zgeqrt.c + lapacke_zgeqrt2.c + lapacke_zgeqrt2_work.c + lapacke_zgeqrt3.c + lapacke_zgeqrt3_work.c + lapacke_zgeqrt_work.c + lapacke_zgerfs.c + lapacke_zgerfs_work.c + lapacke_zgerqf.c + lapacke_zgerqf_work.c + lapacke_zgesdd.c + lapacke_zgesdd_work.c + lapacke_zgesv.c + lapacke_zgesv_work.c + lapacke_zgesvd.c + lapacke_zgesvd_work.c + lapacke_zgesvx.c + lapacke_zgesvx_work.c + lapacke_zgetf2.c + lapacke_zgetf2_work.c + lapacke_zgetrf.c + 
lapacke_zgetrf_work.c + lapacke_zgetri.c + lapacke_zgetri_work.c + lapacke_zgetrs.c + lapacke_zgetrs_work.c + lapacke_zggbak.c + lapacke_zggbak_work.c + lapacke_zggbal.c + lapacke_zggbal_work.c + lapacke_zgges.c + lapacke_zgges_work.c + lapacke_zggesx.c + lapacke_zggesx_work.c + lapacke_zggev.c + lapacke_zggev_work.c + lapacke_zggevx.c + lapacke_zggevx_work.c + lapacke_zggglm.c + lapacke_zggglm_work.c + lapacke_zgghrd.c + lapacke_zgghrd_work.c + lapacke_zgglse.c + lapacke_zgglse_work.c + lapacke_zggqrf.c + lapacke_zggqrf_work.c + lapacke_zggrqf.c + lapacke_zggrqf_work.c + lapacke_zggsvd.c + lapacke_zggsvd_work.c + lapacke_zggsvp.c + lapacke_zggsvp_work.c + lapacke_zgtcon.c + lapacke_zgtcon_work.c + lapacke_zgtrfs.c + lapacke_zgtrfs_work.c + lapacke_zgtsv.c + lapacke_zgtsv_work.c + lapacke_zgtsvx.c + lapacke_zgtsvx_work.c + lapacke_zgttrf.c + lapacke_zgttrf_work.c + lapacke_zgttrs.c + lapacke_zgttrs_work.c + lapacke_zhbev.c + lapacke_zhbev_work.c + lapacke_zhbevd.c + lapacke_zhbevd_work.c + lapacke_zhbevx.c + lapacke_zhbevx_work.c + lapacke_zhbgst.c + lapacke_zhbgst_work.c + lapacke_zhbgv.c + lapacke_zhbgv_work.c + lapacke_zhbgvd.c + lapacke_zhbgvd_work.c + lapacke_zhbgvx.c + lapacke_zhbgvx_work.c + lapacke_zhbtrd.c + lapacke_zhbtrd_work.c + lapacke_zhecon.c + lapacke_zhecon_work.c + lapacke_zheequb.c + lapacke_zheequb_work.c + lapacke_zheev.c + lapacke_zheev_work.c + lapacke_zheevd.c + lapacke_zheevd_work.c + lapacke_zheevr.c + lapacke_zheevr_work.c + lapacke_zheevx.c + lapacke_zheevx_work.c + lapacke_zhegst.c + lapacke_zhegst_work.c + lapacke_zhegv.c + lapacke_zhegv_work.c + lapacke_zhegvd.c + lapacke_zhegvd_work.c + lapacke_zhegvx.c + lapacke_zhegvx_work.c + lapacke_zherfs.c + lapacke_zherfs_work.c + lapacke_zhesv.c + lapacke_zhesv_work.c + lapacke_zhesvx.c + lapacke_zhesvx_work.c + lapacke_zheswapr.c + lapacke_zheswapr_work.c + lapacke_zhetrd.c + lapacke_zhetrd_work.c + lapacke_zhetrf.c + lapacke_zhetrf_work.c + lapacke_zhetri.c + lapacke_zhetri2.c + lapacke_zhetri2_work.c + lapacke_zhetri2x.c + lapacke_zhetri2x_work.c + lapacke_zhetri_work.c + lapacke_zhetrs.c + lapacke_zhetrs2.c + lapacke_zhetrs2_work.c + lapacke_zhetrs_work.c + lapacke_zhfrk.c + lapacke_zhfrk_work.c + lapacke_zhgeqz.c + lapacke_zhgeqz_work.c + lapacke_zhpcon.c + lapacke_zhpcon_work.c + lapacke_zhpev.c + lapacke_zhpev_work.c + lapacke_zhpevd.c + lapacke_zhpevd_work.c + lapacke_zhpevx.c + lapacke_zhpevx_work.c + lapacke_zhpgst.c + lapacke_zhpgst_work.c + lapacke_zhpgv.c + lapacke_zhpgv_work.c + lapacke_zhpgvd.c + lapacke_zhpgvd_work.c + lapacke_zhpgvx.c + lapacke_zhpgvx_work.c + lapacke_zhprfs.c + lapacke_zhprfs_work.c + lapacke_zhpsv.c + lapacke_zhpsv_work.c + lapacke_zhpsvx.c + lapacke_zhpsvx_work.c + lapacke_zhptrd.c + lapacke_zhptrd_work.c + lapacke_zhptrf.c + lapacke_zhptrf_work.c + lapacke_zhptri.c + lapacke_zhptri_work.c + lapacke_zhptrs.c + lapacke_zhptrs_work.c + lapacke_zhsein.c + lapacke_zhsein_work.c + lapacke_zhseqr.c + lapacke_zhseqr_work.c + lapacke_zlacgv.c + lapacke_zlacgv_work.c + lapacke_zlacn2.c + lapacke_zlacn2_work.c + lapacke_zlacp2.c + lapacke_zlacp2_work.c + lapacke_zlacpy.c + lapacke_zlacpy_work.c + lapacke_zlag2c.c + lapacke_zlag2c_work.c + lapacke_zlange.c + lapacke_zlange_work.c + lapacke_zlanhe.c + lapacke_zlanhe_work.c + lapacke_zlansy.c + lapacke_zlansy_work.c + lapacke_zlantr.c + lapacke_zlantr_work.c + lapacke_zlapmr.c + lapacke_zlapmr_work.c + lapacke_zlarfb.c + lapacke_zlarfb_work.c + lapacke_zlarfg.c + lapacke_zlarfg_work.c + lapacke_zlarft.c + lapacke_zlarft_work.c + 
lapacke_zlarfx.c + lapacke_zlarfx_work.c + lapacke_zlarnv.c + lapacke_zlarnv_work.c + lapacke_zlaset.c + lapacke_zlaset_work.c + lapacke_zlaswp.c + lapacke_zlaswp_work.c + lapacke_zlauum.c + lapacke_zlauum_work.c + lapacke_zpbcon.c + lapacke_zpbcon_work.c + lapacke_zpbequ.c + lapacke_zpbequ_work.c + lapacke_zpbrfs.c + lapacke_zpbrfs_work.c + lapacke_zpbstf.c + lapacke_zpbstf_work.c + lapacke_zpbsv.c + lapacke_zpbsv_work.c + lapacke_zpbsvx.c + lapacke_zpbsvx_work.c + lapacke_zpbtrf.c + lapacke_zpbtrf_work.c + lapacke_zpbtrs.c + lapacke_zpbtrs_work.c + lapacke_zpftrf.c + lapacke_zpftrf_work.c + lapacke_zpftri.c + lapacke_zpftri_work.c + lapacke_zpftrs.c + lapacke_zpftrs_work.c + lapacke_zpocon.c + lapacke_zpocon_work.c + lapacke_zpoequ.c + lapacke_zpoequ_work.c + lapacke_zpoequb.c + lapacke_zpoequb_work.c + lapacke_zporfs.c + lapacke_zporfs_work.c + lapacke_zposv.c + lapacke_zposv_work.c + lapacke_zposvx.c + lapacke_zposvx_work.c + lapacke_zpotrf.c + lapacke_zpotrf_work.c + lapacke_zpotri.c + lapacke_zpotri_work.c + lapacke_zpotrs.c + lapacke_zpotrs_work.c + lapacke_zppcon.c + lapacke_zppcon_work.c + lapacke_zppequ.c + lapacke_zppequ_work.c + lapacke_zpprfs.c + lapacke_zpprfs_work.c + lapacke_zppsv.c + lapacke_zppsv_work.c + lapacke_zppsvx.c + lapacke_zppsvx_work.c + lapacke_zpptrf.c + lapacke_zpptrf_work.c + lapacke_zpptri.c + lapacke_zpptri_work.c + lapacke_zpptrs.c + lapacke_zpptrs_work.c + lapacke_zpstrf.c + lapacke_zpstrf_work.c + lapacke_zptcon.c + lapacke_zptcon_work.c + lapacke_zpteqr.c + lapacke_zpteqr_work.c + lapacke_zptrfs.c + lapacke_zptrfs_work.c + lapacke_zptsv.c + lapacke_zptsv_work.c + lapacke_zptsvx.c + lapacke_zptsvx_work.c + lapacke_zpttrf.c + lapacke_zpttrf_work.c + lapacke_zpttrs.c + lapacke_zpttrs_work.c + lapacke_zspcon.c + lapacke_zspcon_work.c + lapacke_zsprfs.c + lapacke_zsprfs_work.c + lapacke_zspsv.c + lapacke_zspsv_work.c + lapacke_zspsvx.c + lapacke_zspsvx_work.c + lapacke_zsptrf.c + lapacke_zsptrf_work.c + lapacke_zsptri.c + lapacke_zsptri_work.c + lapacke_zsptrs.c + lapacke_zsptrs_work.c + lapacke_zstedc.c + lapacke_zstedc_work.c + lapacke_zstegr.c + lapacke_zstegr_work.c + lapacke_zstein.c + lapacke_zstein_work.c + lapacke_zstemr.c + lapacke_zstemr_work.c + lapacke_zsteqr.c + lapacke_zsteqr_work.c + lapacke_zsycon.c + lapacke_zsycon_work.c + lapacke_zsyconv.c + lapacke_zsyconv_work.c + lapacke_zsyequb.c + lapacke_zsyequb_work.c + lapacke_zsyrfs.c + lapacke_zsyrfs_work.c + lapacke_zsysv.c + lapacke_zsysv_rook.c + lapacke_zsysv_rook_work.c + lapacke_zsysv_work.c + lapacke_zsysvx.c + lapacke_zsysvx_work.c + lapacke_zsyswapr.c + lapacke_zsyswapr_work.c + lapacke_zsytrf.c + lapacke_zsytrf_work.c + lapacke_zsytri.c + lapacke_zsytri2.c + lapacke_zsytri2_work.c + lapacke_zsytri2x.c + lapacke_zsytri2x_work.c + lapacke_zsytri_work.c + lapacke_zsytrs.c + lapacke_zsytrs2.c + lapacke_zsytrs2_work.c + lapacke_zsytrs_work.c + lapacke_ztbcon.c + lapacke_ztbcon_work.c + lapacke_ztbrfs.c + lapacke_ztbrfs_work.c + lapacke_ztbtrs.c + lapacke_ztbtrs_work.c + lapacke_ztfsm.c + lapacke_ztfsm_work.c + lapacke_ztftri.c + lapacke_ztftri_work.c + lapacke_ztfttp.c + lapacke_ztfttp_work.c + lapacke_ztfttr.c + lapacke_ztfttr_work.c + lapacke_ztgevc.c + lapacke_ztgevc_work.c + lapacke_ztgexc.c + lapacke_ztgexc_work.c + lapacke_ztgsen.c + lapacke_ztgsen_work.c + lapacke_ztgsja.c + lapacke_ztgsja_work.c + lapacke_ztgsna.c + lapacke_ztgsna_work.c + lapacke_ztgsyl.c + lapacke_ztgsyl_work.c + lapacke_ztpcon.c + lapacke_ztpcon_work.c + lapacke_ztpmqrt.c + lapacke_ztpmqrt_work.c + 
lapacke_ztpqrt.c + lapacke_ztpqrt2.c + lapacke_ztpqrt2_work.c + lapacke_ztpqrt_work.c + lapacke_ztprfb.c + lapacke_ztprfb_work.c + lapacke_ztprfs.c + lapacke_ztprfs_work.c + lapacke_ztptri.c + lapacke_ztptri_work.c + lapacke_ztptrs.c + lapacke_ztptrs_work.c + lapacke_ztpttf.c + lapacke_ztpttf_work.c + lapacke_ztpttr.c + lapacke_ztpttr_work.c + lapacke_ztrcon.c + lapacke_ztrcon_work.c + lapacke_ztrevc.c + lapacke_ztrevc_work.c + lapacke_ztrexc.c + lapacke_ztrexc_work.c + lapacke_ztrrfs.c + lapacke_ztrrfs_work.c + lapacke_ztrsen.c + lapacke_ztrsen_work.c + lapacke_ztrsna.c + lapacke_ztrsna_work.c + lapacke_ztrsyl.c + lapacke_ztrsyl_work.c + lapacke_ztrtri.c + lapacke_ztrtri_work.c + lapacke_ztrtrs.c + lapacke_ztrtrs_work.c + lapacke_ztrttf.c + lapacke_ztrttf_work.c + lapacke_ztrttp.c + lapacke_ztrttp_work.c + lapacke_ztzrzf.c + lapacke_ztzrzf_work.c + lapacke_zunbdb.c + lapacke_zunbdb_work.c + lapacke_zuncsd.c + lapacke_zuncsd_work.c + lapacke_zungbr.c + lapacke_zungbr_work.c + lapacke_zunghr.c + lapacke_zunghr_work.c + lapacke_zunglq.c + lapacke_zunglq_work.c + lapacke_zungql.c + lapacke_zungql_work.c + lapacke_zungqr.c + lapacke_zungqr_work.c + lapacke_zungrq.c + lapacke_zungrq_work.c + lapacke_zungtr.c + lapacke_zungtr_work.c + lapacke_zunmbr.c + lapacke_zunmbr_work.c + lapacke_zunmhr.c + lapacke_zunmhr_work.c + lapacke_zunmlq.c + lapacke_zunmlq_work.c + lapacke_zunmql.c + lapacke_zunmql_work.c + lapacke_zunmqr.c + lapacke_zunmqr_work.c + lapacke_zunmrq.c + lapacke_zunmrq_work.c + lapacke_zunmrz.c + lapacke_zunmrz_work.c + lapacke_zunmtr.c + lapacke_zunmtr_work.c + lapacke_zupgtr.c + lapacke_zupgtr_work.c + lapacke_zupmtr.c + lapacke_zupmtr_work.c + lapacke_zsyr.c + lapacke_csyr.c + lapacke_zsyr_work.c + lapacke_csyr_work.c + lapacke_ilaver.c +) + +set(SRCX + lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c + lapacke_cgbrfsx_work.c lapacke_cporfsx_work.c lapacke_dgerfsx_work.c lapacke_sgbrfsx_work.c lapacke_ssyrfsx_work.c lapacke_zherfsx_work.c + lapacke_cgerfsx.c lapacke_csyrfsx.c lapacke_dporfsx.c lapacke_sgerfsx.c lapacke_zgbrfsx.c lapacke_zporfsx.c + lapacke_cgerfsx_work.c lapacke_csyrfsx_work.c lapacke_dporfsx_work.c lapacke_sgerfsx_work.c lapacke_zgbrfsx_work.c lapacke_zporfsx_work.c + lapacke_cherfsx.c lapacke_dgbrfsx.c lapacke_dsyrfsx.c lapacke_sporfsx.c lapacke_zgerfsx.c lapacke_zsyrfsx.c + lapacke_cherfsx_work.c lapacke_dgbrfsx_work.c lapacke_dsyrfsx_work.c lapacke_sporfsx_work.c lapacke_zgerfsx_work.c lapacke_zsyrfsx_work.c + lapacke_cgbsvxx.c lapacke_cposvxx.c lapacke_dgesvxx.c lapacke_sgbsvxx.c lapacke_ssysvxx.c lapacke_zhesvxx.c + lapacke_cgbsvxx_work.c lapacke_cposvxx_work.c lapacke_dgesvxx_work.c lapacke_sgbsvxx_work.c lapacke_ssysvxx_work.c lapacke_zhesvxx_work.c + lapacke_cgesvxx.c lapacke_csysvxx.c lapacke_dposvxx.c lapacke_sgesvxx.c lapacke_zgbsvxx.c lapacke_zposvxx.c + lapacke_cgesvxx_work.c lapacke_csysvxx_work.c lapacke_dposvxx_work.c lapacke_sgesvxx_work.c lapacke_zgbsvxx_work.c lapacke_zposvxx_work.c + lapacke_chesvxx.c lapacke_dgbsvxx.c lapacke_dsysvxx.c lapacke_sposvxx.c lapacke_zgesvxx.c lapacke_zsysvxx.c + lapacke_chesvxx_work.c lapacke_dgbsvxx_work.c lapacke_dsysvxx_work.c lapacke_sposvxx_work.c lapacke_zgesvxx_work.c lapacke_zsysvxx_work.c +) + + +# FILE PARTS OF TMGLIB +set(MATGEN + lapacke_clatms.c + lapacke_clatms_work.c + lapacke_dlatms.c + lapacke_dlatms_work.c + lapacke_slatms.c + lapacke_slatms_work.c + lapacke_zlatms.c + lapacke_zlatms_work.c + lapacke_clagge.c + lapacke_clagge_work.c + 
lapacke_dlagge.c + lapacke_dlagge_work.c + lapacke_slagge.c + lapacke_slagge_work.c + lapacke_zlagge.c + lapacke_zlagge_work.c + lapacke_claghe.c + lapacke_claghe_work.c + lapacke_zlaghe.c + lapacke_zlaghe_work.c + lapacke_clagsy.c + lapacke_clagsy_work.c + lapacke_dlagsy.c + lapacke_dlagsy_work.c + lapacke_slagsy.c + lapacke_slagsy_work.c + lapacke_zlagsy.c + lapacke_zlagsy_work.c +) + +set(LAPACKE_REL_SRC "") +if (BUILD_SINGLE) + list(APPEND LAPACKE_REL_SRC ${SSRC}) +endif () + +if (BUILD_DOUBLE) + list(APPEND LAPACKE_REL_SRC ${DSRC}) +endif () + +if (BUILD_COMPLEX) + list(APPEND LAPACKE_REL_SRC ${CSRC}) +endif () + +if (BUILD_COMPLEX16) + list(APPEND LAPACKE_REL_SRC ${ZSRC}) +endif () + +# add lapack-netlib folder to the sources +set(LAPACKE_SOURCES "") +foreach (LAE_FILE ${LAPACKE_REL_SRC}) + list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}") +endforeach () + +set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include") +execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h") +include_directories(${lapacke_include_dir}) +set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") diff --git a/cmake/os.cmake b/cmake/os.cmake new file mode 100644 index 000000000..f5a75027c --- /dev/null +++ b/cmake/os.cmake @@ -0,0 +1,104 @@ +## +## Author: Hank Anderson +## Description: Ported from portion of OpenBLAS/Makefile.system +## Detects the OS and sets appropriate variables. + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + set(ENV{MACOSX_DEPLOYMENT_TARGET} "10.2") # TODO: should be exported as an env var + set(MD5SUM "md5 -r") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") + set(MD5SUM "md5 -r") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "NetBSD") + set(MD5SUM "md5 -n") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(EXTRALIB "${EXTRALIB} -lm") + set(NO_EXPRECISION 1) +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") + set(EXTRALIB "${EXTRALIB} -lm") +endif () + +# TODO: this is probably meant for mingw, not other windows compilers +if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + + set(NEED_PIC 0) + set(NO_EXPRECISION 1) + + set(EXTRALIB "${EXTRALIB} -defaultlib:advapi32") + + # probably not going to use these + set(SUFFIX "obj") + set(PSUFFIX "pobj") + set(LIBSUFFIX "a") + + if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") + endif () + + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + + # Test for supporting MS_ABI + # removed string parsing in favor of CMake's version comparison -hpa + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + # GCC Version >=4.7 + # It is compatible with MSVC ABI. 
+ set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") + endif () + endif () + + # Ensure the correct stack alignment on Win32 + # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 + if (${ARCH} STREQUAL "x86") + if (NOT MSVC AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") + endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") + endif () + +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Interix") + set(NEED_PIC 0) + set(NO_EXPRECISION 1) + + set(INTERIX_TOOL_DIR "/opt/gcc.3.3/i586-pc-interix3/bin") +endif () + +if (CYGWIN) + set(NEED_PIC 0) + set(NO_EXPRECISION 1) +endif () + +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") + if (SMP) + set(EXTRALIB "${EXTRALIB} -lpthread") + endif () +endif () + +if (QUAD_PRECISION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DQUAD_PRECISION") + set(NO_EXPRECISION 1) +endif () + +if (${ARCH} STREQUAL "x86") + set(NO_EXPRECISION 1) +endif () + +if (UTEST_CHECK) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") + set(SANITY_CHECK 1) +endif () + +if (SANITY_CHECK) + # TODO: need some way to get $(*F) (target filename) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSANITY_CHECK -DREFNAME=$(*F)f${BU}") +endif () + diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake new file mode 100644 index 000000000..c3fa48655 --- /dev/null +++ b/cmake/prebuild.cmake @@ -0,0 +1,113 @@ +## +## Author: Hank Anderson +## Description: Ported from OpenBLAS/Makefile.prebuild +## This is triggered by system.cmake and runs before any of the code is built. +## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files). +## Next it runs f_check and appends some Fortran information to the files. +## Finally it runs getarch and getarch_2nd for even more environment information.
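# An illustrative sketch of the getarch step (hypothetical sample output and
# underscore-prefixed helper variables; the real parsing is done by
# ParseGetArchVars in cmake/utils.cmake further down in this patch): getarch
# prints Makefile-style VARNAME=VALUE pairs, and each pair becomes a CMake
# variable roughly like this:
set(_sample_getarch_out "CORE=NEHALEM\nLIBCORE=nehalem\nNUM_CORES=4")
string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" _pairs "${_sample_getarch_out}")
foreach (_pair ${_pairs})
  string(REGEX REPLACE "=.*$" "" _name "${_pair}")      # text before '='
  string(REGEX REPLACE "^[^=]+=" "" _value "${_pair}")  # text after '='
  set(${_name} "${_value}")                             # e.g. CORE becomes NEHALEM
endforeach ()
message(STATUS "Detected core: ${CORE}")                # prints "Detected core: NEHALEM"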
+ +# CMake vars set by this file: +# CORE +# LIBCORE +# NUM_CORES +# HAVE_MMX +# HAVE_SSE +# HAVE_SSE2 +# HAVE_SSE3 +# MAKE +# SGEMM_UNROLL_M +# SGEMM_UNROLL_N +# DGEMM_UNROLL_M +# DGEMM_UNROLL_N +# QGEMM_UNROLL_M +# QGEMM_UNROLL_N +# CGEMM_UNROLL_M +# CGEMM_UNROLL_N +# ZGEMM_UNROLL_M +# ZGEMM_UNROLL_N +# XGEMM_UNROLL_M +# XGEMM_UNROLL_N +# CGEMM3M_UNROLL_M +# CGEMM3M_UNROLL_N +# ZGEMM3M_UNROLL_M +# ZGEMM3M_UNROLL_N +# XGEMM3M_UNROLL_M +# XGEMM3M_UNROLL_N + +# CPUIDEMU = ../../cpuid/table.o + +if (DEFINED CPUIDEMU) + set(EXFLAGS "-DCPUIDEMU -DVENDOR=99") +endif () + +if (DEFINED TARGET_CORE) + # set the C flags for just this file + set(GETARCH2_FLAGS "-DBUILD_KERNEL") + set(TARGET_MAKE "Makefile_kernel.conf") + set(TARGET_CONF "config_kernel.h") +else() + set(TARGET_MAKE "Makefile.conf") + set(TARGET_CONF "config.h") +endif () + +include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") + +if (NOT NOFORTRAN) + include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") +endif () + +# compile getarch +set(GETARCH_SRC + ${CMAKE_SOURCE_DIR}/getarch.c + ${CPUIDEMU} +) + +if (NOT MSVC) + list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S) +endif () + +if (MSVC) +#Use generic for MSVC now +set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) +endif() + +set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") +set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") +file(MAKE_DIRECTORY ${GETARCH_DIR}) +try_compile(GETARCH_RESULT ${GETARCH_DIR} + SOURCES ${GETARCH_SRC} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} +) + +message(STATUS "Running getarch") + +# run the getarch binary and capture its Makefile-style and config.h-style output +execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 0 OUTPUT_VARIABLE GETARCH_MAKE_OUT) +execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) + +message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") + +# append config data from getarch to the TARGET file and read in CMake vars +file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) +ParseGetArchVars(${GETARCH_MAKE_OUT}) + +set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") +set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") +file(MAKE_DIRECTORY ${GETARCH2_DIR}) +try_compile(GETARCH2_RESULT ${GETARCH2_DIR} + SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH2_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} +) + +# run the getarch_2nd binary and capture its Makefile-style and config.h-style output
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) +execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) + +# append config data from getarch_2nd to the TARGET file and read in CMake vars +file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) +ParseGetArchVars(${GETARCH2_MAKE_OUT}) + diff --git a/cmake/system.cmake b/cmake/system.cmake new file mode 100644 index 000000000..134e9c12d --- /dev/null +++ b/cmake/system.cmake @@ -0,0 +1,552 @@ +## +## Author: Hank Anderson +## Description: Ported from OpenBLAS/Makefile.system +## + +set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib") + +# TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa +# http://stackoverflow.com/questions/714100/os-detecting-makefile + +# TODO: Makefile.system sets HOSTCC = $(CC)
here if not already set -hpa + +# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. +if (DEFINED TARGET_CORE) + set(TARGET ${TARGET_CORE}) +endif () + +# Force fallbacks for 32bit +if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) + message(STATUS "Compiling a ${BINARY}-bit binary.") + set(NO_AVX 1) + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") + set(TARGET "NEHALEM") + endif () + if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER") + set(TARGET "BARCELONA") + endif () +endif () + +if (DEFINED TARGET) + message(STATUS "Targetting the ${TARGET} architecture.") + set(GETARCH_FLAGS "-DFORCE_${TARGET}") +endif () + +if (INTERFACE64) + message(STATUS "Using 64-bit integers.") + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") +endif () + +if (NOT DEFINED GEMM_MULTITHREAD_THRESHOLD) + set(GEMM_MULTITHREAD_THRESHOLD 4) +endif () +message(STATUS "GEMM multithread threshold set to ${GEMM_MULTITHREAD_THRESHOLD}.") +set(GETARCH_FLAGS "${GETARCH_FLAGS} -DGEMM_MULTITHREAD_THRESHOLD=${GEMM_MULTITHREAD_THRESHOLD}") + +if (NO_AVX) + message(STATUS "Disabling Advanced Vector Extensions (AVX).") + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX") +endif () + +if (NO_AVX2) + message(STATUS "Disabling Advanced Vector Extensions 2 (AVX2).") + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2") +endif () + +if (CMAKE_BUILD_TYPE STREQUAL Debug) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -g") +endif () + +# TODO: let CMake handle this? -hpa +#if (${QUIET_MAKE}) +# set(MAKE "${MAKE} -s") +#endif() + +if (NOT DEFINED NO_PARALLEL_MAKE) + set(NO_PARALLEL_MAKE 0) +endif () +set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_PARALLEL_MAKE=${NO_PARALLEL_MAKE}") + +if (CMAKE_CXX_COMPILER STREQUAL loongcc) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") +endif () + +#if don't use Fortran, it will only compile CBLAS. +if (ONLY_CBLAS) + set(NO_LAPACK 1) +else () + set(ONLY_CBLAS 0) +endif () + +include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake") + +if (NOT DEFINED NUM_THREADS) + set(NUM_THREADS ${NUM_CORES}) +endif () + +if (${NUM_THREADS} EQUAL 1) + set(USE_THREAD 0) +endif () + +if (DEFINED USE_THREAD) + if (NOT ${USE_THREAD}) + unset(SMP) + else () + set(SMP 1) + endif () +else () + # N.B. 
this is NUM_THREAD in Makefile.system which is probably a bug -hpa + if (${NUM_THREADS} EQUAL 1) + unset(SMP) + else () + set(SMP 1) + endif () +endif () + +if (${SMP}) + message(STATUS "SMP enabled.") +endif () + +if (NOT DEFINED NEED_PIC) + set(NEED_PIC 1) +endif () + +# TODO: I think CMake should be handling all this stuff -hpa +unset(ARFLAGS) +set(CPP "${COMPILER} -E") +set(AR "${CROSS_SUFFIX}ar") +set(AS "${CROSS_SUFFIX}as") +set(LD "${CROSS_SUFFIX}ld") +set(RANLIB "${CROSS_SUFFIX}ranlib") +set(NM "${CROSS_SUFFIX}nm") +set(DLLWRAP "${CROSS_SUFFIX}dllwrap") +set(OBJCOPY "${CROSS_SUFFIX}objcopy") +set(OBJCONV "${CROSS_SUFFIX}objconv") + +# OS dependent settings +include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") + +# Architecture dependent settings +include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") + +# C Compiler dependent settings +include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") + +if (NOT NOFORTRAN) + # Fortran Compiler dependent settings + include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") +endif () + +if (BINARY64) + if (INTERFACE64) + # CCOMMON_OPT += -DUSE64BITINT + endif () +endif () + +if (NEED_PIC) + if (${CMAKE_C_COMPILER} STREQUAL "IBM") + set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -fPIC") + endif () + + if (${F_COMPILER} STREQUAL "SUN") + set(FCOMMON_OPT "${FCOMMON_OPT} -pic") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") + endif () +endif () + +if (DYNAMIC_ARCH) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") +endif () + +if (NO_LAPACK) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") + #Disable LAPACK C interface + set(NO_LAPACKE 1) +endif () + +if (NO_LAPACKE) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACKE") +endif () + +if (NO_AVX) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX") +endif () + +if (${ARCH} STREQUAL "x86") + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX") +endif () + +if (NO_AVX2) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") +endif () + +if (SMP) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSMP_SERVER") + + if (${ARCH} STREQUAL "mips64") + if (NOT ${CORE} STREQUAL "LOONGSON3B") + set(USE_SIMPLE_THREADED_LEVEL3 1) + endif () + endif () + + if (USE_OPENMP) + # USE_SIMPLE_THREADED_LEVEL3 = 1 + # NO_AFFINITY = 1 + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") + endif () + + if (BIGNUMA) + set(CCOMMON_OPT "${CCOMMON_OPT} -DBIGNUMA") + endif () + +endif () + +if (NO_WARMUP) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_WARMUP") +endif () + +if (CONSISTENT_FPCSR) + set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") +endif () + +# Only for development +# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") +# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") +# set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_SWITCHING") +# set(USE_PAPI 1) + +if (USE_PAPI) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_PAPI") + set(EXTRALIB "${EXTRALIB} -lpapi -lperfctr") +endif () + +if (DYNAMIC_THREADS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_THREADS") +endif () + +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") + +if (USE_SIMPLE_THREADED_LEVEL3) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") +endif () + +if (DEFINED LIBNAMESUFFIX) + set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") +else () + set(LIBPREFIX "libopenblas") +endif () + +if (NOT DEFINED SYMBOLPREFIX) + set(SYMBOLPREFIX "") +endif () + +if (NOT DEFINED SYMBOLSUFFIX) + set(SYMBOLSUFFIX "") +endif () + +set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") + +# TODO: nead to convert these Makefiles +# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake + +if (${CORE} STREQUAL "PPC440") + 
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") +endif () + +if (${CORE} STREQUAL "PPC440FP2") + set(STATIC_ALLOCATION 1) +endif () + +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(NO_AFFINITY 1) +endif () + +if (NOT ${ARCH} STREQUAL "x86_64" AND NOT ${ARCH} STREQUAL "x86" AND NOT ${CORE} STREQUAL "LOONGSON3B") + set(NO_AFFINITY 1) +endif () + +if (NO_AFFINITY) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AFFINITY") +endif () + +if (FUNCTION_PROFILE) + set(CCOMMON_OPT "${CCOMMON_OPT} -DFUNCTION_PROFILE") +endif () + +if (HUGETLB_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLB") +endif () + +if (DEFINED HUGETLBFILE_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=${HUGETLBFILE_ALLOCATION})") +endif () + +if (STATIC_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_STATIC") +endif () + +if (DEVICEDRIVER_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\"") +endif () + +if (MIXED_MEMORY_ALLOCATION) + set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "SunOS") + set(TAR gtar) + set(PATCH gpatch) + set(GREP ggrep) +else () + set(TAR tar) + set(PATCH patch) + set(GREP grep) +endif () + +if (NOT DEFINED MD5SUM) + set(MD5SUM md5sum) +endif () + +set(AWK awk) + +set(REVISION "-r${OpenBLAS_VERSION}") +set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) + +if (DEBUG) + set(COMMON_OPT "${COMMON_OPT} -g") +endif () + +if (NOT DEFINED COMMON_OPT) + set(COMMON_OPT "-O2") +endif () + +#For x86 32-bit +if (DEFINED BINARY AND BINARY EQUAL 32) +if (NOT MSVC) + set(COMMON_OPT "${COMMON_OPT} -m32") +endif() +endif() + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") +if(NOT MSVC) +set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") +endif() +# TODO: not sure what PFLAGS is -hpa +set(PFLAGS "${PFLAGS} ${COMMON_OPT} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") + +set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${COMMON_OPT} ${FCOMMON_OPT}") +# TODO: not sure what FPFLAGS is -hpa +set(FPFLAGS "${FPFLAGS} ${COMMON_OPT} ${FCOMMON_OPT} ${COMMON_PROF}") + +#For LAPACK Fortran codes. +set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}") +set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") + +#Disable -fopenmp for LAPACK Fortran codes on Windows. 
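# An illustrative sketch of the flag filtering below, with a hypothetical flags
# value (the real code loops over FILTER_FLAGS and rewrites LAPACK_FFLAGS and
# LAPACK_FPFLAGS in place):
set(_lapack_fflags_demo "-O2 -fopenmp -frecursive")
string(REPLACE "-fopenmp" "" _lapack_fflags_demo "${_lapack_fflags_demo}")
# _lapack_fflags_demo is now "-O2  -frecursive"; the flag is gone and the
# leftover double space is harmless when the string reaches the compiler.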
+if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parralel") + foreach (FILTER_FLAG ${FILTER_FLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) + endforeach () +endif () + +if ("${F_COMPILER}" STREQUAL "GFORTRAN") + # lapack-netlib is rife with uninitialized warnings -hpa + set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized") +endif () + +set(LAPACK_CFLAGS "${CMAKE_C_CFLAGS} -DHAVE_LAPACK_CONFIG_H") +if (INTERFACE64) + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_ILP64") +endif () + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DOPENBLAS_OS_WINDOWS") +endif () + +if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") +endif () + +if (NOT DEFINED SUFFIX) + set(SUFFIX o) +endif () + +if (NOT DEFINED PSUFFIX) + set(PSUFFIX po) +endif () + +if (NOT DEFINED LIBSUFFIX) + set(LIBSUFFIX a) +endif () + +if (DYNAMIC_ARCH) + if (DEFINED SMP) + set(LIBNAME "${LIBPREFIX}p${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}p${REVISION}_p.${LIBSUFFIX}") + else () + set(LIBNAME "${LIBPREFIX}${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}${REVISION}_p.${LIBSUFFIX}") + endif () +else () + if (DEFINED SMP) + set(LIBNAME "${LIBPREFIX}_${LIBCORE}p${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}p${REVISION}_p.${LIBSUFFIX}") + else () + set(LIBNAME "${LIBPREFIX}_${LIBCORE}${REVISION}.${LIBSUFFIX}") + set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}${REVISION}_p.${LIBSUFFIX}") + endif () +endif () + + +set(LIBDLLNAME "${LIBPREFIX}.dll") +set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so") +set(LIBDYNNAME "${LIBNAME}.${LIBSUFFIX}.dylib") +set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def") +set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") +set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") + +set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}") +set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}") + + +set(LIB_COMPONENTS BLAS) +if (NOT NO_CBLAS) + set(LIB_COMPONENTS "${LIB_COMPONENTS} CBLAS") +endif () + +if (NOT NO_LAPACK) + set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACK") + if (NOT NO_LAPACKE) + set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACKE") + endif () +endif () + +if (ONLY_CBLAS) + set(LIB_COMPONENTS CBLAS) +endif () + + +# For GEMM3M +set(USE_GEMM3M 0) + +if (DEFINED ARCH) + if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS") + set(USE_GEMM3M 1) + endif () + + if (${CORE} STREQUAL "generic") + set(USE_GEMM3M 0) + endif () +endif () + + +#export OSNAME +#export ARCH +#export CORE +#export LIBCORE +#export PGCPATH +#export CONFIG +#export CC +#export FC +#export BU +#export FU +#export NEED2UNDERSCORES +#export USE_THREAD +#export NUM_THREADS +#export NUM_CORES +#export SMP +#export MAKEFILE_RULE +#export NEED_PIC +#export BINARY +#export BINARY32 +#export BINARY64 +#export F_COMPILER +#export C_COMPILER +#export USE_OPENMP +#export CROSS +#export CROSS_SUFFIX +#export NOFORTRAN +#export NO_FBLAS +#export EXTRALIB +#export CEXTRALIB +#export FEXTRALIB +#export HAVE_SSE +#export HAVE_SSE2 +#export HAVE_SSE3 +#export HAVE_SSSE3 +#export HAVE_SSE4_1 +#export HAVE_SSE4_2 +#export HAVE_SSE4A +#export HAVE_SSE5 +#export HAVE_AVX +#export HAVE_VFP +#export HAVE_VFPV3 +#export HAVE_VFPV4 +#export HAVE_NEON +#export KERNELDIR +#export FUNCTION_PROFILE +#export TARGET_CORE +# +#export SGEMM_UNROLL_M 
+#export SGEMM_UNROLL_N +#export DGEMM_UNROLL_M +#export DGEMM_UNROLL_N +#export QGEMM_UNROLL_M +#export QGEMM_UNROLL_N +#export CGEMM_UNROLL_M +#export CGEMM_UNROLL_N +#export ZGEMM_UNROLL_M +#export ZGEMM_UNROLL_N +#export XGEMM_UNROLL_M +#export XGEMM_UNROLL_N +#export CGEMM3M_UNROLL_M +#export CGEMM3M_UNROLL_N +#export ZGEMM3M_UNROLL_M +#export ZGEMM3M_UNROLL_N +#export XGEMM3M_UNROLL_M +#export XGEMM3M_UNROLL_N + + +#if (USE_CUDA) +# export CUDADIR +# export CUCC +# export CUFLAGS +# export CULIB +#endif + +#.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f +# +#.f.$(SUFFIX): +# $(FC) $(FFLAGS) -c $< -o $(@F) +# +#.f.$(PSUFFIX): +# $(FC) $(FPFLAGS) -pg -c $< -o $(@F) + +# these are not cross-platform +#ifdef BINARY64 +#PATHSCALEPATH = /opt/pathscale/lib/3.1 +#PGIPATH = /opt/pgi/linux86-64/7.1-5/lib +#else +#PATHSCALEPATH = /opt/pathscale/lib/3.1/32 +#PGIPATH = /opt/pgi/linux86/7.1-5/lib +#endif + +#ACMLPATH = /opt/acml/4.3.0 +#ifneq ($(OSNAME), Darwin) +#MKLPATH = /opt/intel/mkl/10.2.2.025/lib +#else +#MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib +#endif +#ATLASPATH = /opt/atlas/3.9.17/opteron +#FLAMEPATH = $(HOME)/flame/lib +#ifneq ($(OSNAME), SunOS) +#SUNPATH = /opt/sunstudio12.1 +#else +#SUNPATH = /opt/SUNWspro +#endif + diff --git a/cmake/utils.cmake b/cmake/utils.cmake new file mode 100644 index 000000000..6e2a98069 --- /dev/null +++ b/cmake/utils.cmake @@ -0,0 +1,346 @@ +# Functions to help with the OpenBLAS build + +# Reads string from getarch into CMake vars. Format of getarch vars is VARNAME=VALUE +function(ParseGetArchVars GETARCH_IN) + string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_IN}") + foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) + # split the line into var and value, then assign the value to a CMake var + string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") + list(GET SPLIT_VAR 0 VAR_NAME) + list(GET SPLIT_VAR 1 VAR_VALUE) + set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) + endforeach () +endfunction () + +# Reads a Makefile into CMake vars. +macro(ParseMakefileVars MAKEFILE_IN) + message(STATUS "Reading vars from ${MAKEFILE_IN}...") + file(STRINGS ${MAKEFILE_IN} makefile_contents) + foreach (makefile_line ${makefile_contents}) + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + set(var_name ${CMAKE_MATCH_1}) + set(var_value ${CMAKE_MATCH_2}) + # check for Makefile variables in the string, e.g. $(TSUFFIX) + string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) + foreach (make_var ${make_var_matches}) + # strip out Makefile $() markup + string(REGEX REPLACE "\\$\\(([0-9_a-zA-Z]+)\\)" "\\1" make_var ${make_var}) + # now replace the instance of the Makefile variable with the value of the CMake variable (note the double quote) + string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) + endforeach () + set(${var_name} ${var_value}) + else () + string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + endif () + endif () + endforeach () +endmacro () + +# Returns all combinations of the input list, as a list with colon-separated combinations +# E.g. input of A B C returns A B C A:B A:C B:C +# N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). 
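# Usage sketch (illustrative only; the option names are hypothetical and the
# expected results are inferred from the implementation below):
#   set(MY_OPTIONS "TRANS;UNIT")
#   AllCombinations("${MY_OPTIONS}" "N;N")   # quote the list, as noted above
#   # LIST_OUT  -> " ";TRANS;UNIT;TRANS:UNIT  (a lone space stands for the empty combination)
#   # CODES_OUT -> NN;TN;NU;TU                (absent items contribute their absent_codes letter)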
+# #param absent_codes codes to use when an element is absent from a combination. For example, if you have TRANS;UNIT;UPPER you may want the code to be NNL when nothing is present. +# @returns LIST_OUT a list of combinations +# CODES_OUT a list of codes corresponding to each combination, with N meaning the item is not present, and the first letter of the list item meaning it is presen +function(AllCombinations list_in absent_codes_in) + list(LENGTH list_in list_count) + set(num_combos 1) + # subtract 1 since we will iterate from 0 to num_combos + math(EXPR num_combos "(${num_combos} << ${list_count}) - 1") + set(LIST_OUT "") + set(CODES_OUT "") + foreach (c RANGE 0 ${num_combos}) + + set(current_combo "") + set(current_code "") + + # this is a little ridiculous just to iterate through a list w/ indices + math(EXPR last_list_index "${list_count} - 1") + foreach (list_index RANGE 0 ${last_list_index}) + math(EXPR bit "1 << ${list_index}") + math(EXPR combo_has_bit "${c} & ${bit}") + list(GET list_in ${list_index} list_elem) + if (combo_has_bit) + if (current_combo) + set(current_combo "${current_combo}:${list_elem}") + else () + set(current_combo ${list_elem}) + endif () + string(SUBSTRING ${list_elem} 0 1 code_char) + else () + list(GET absent_codes_in ${list_index} code_char) + endif () + set(current_code "${current_code}${code_char}") + endforeach () + + if (current_combo STREQUAL "") + list(APPEND LIST_OUT " ") # Empty set is a valid combination, but CMake isn't appending the empty string for some reason, use a space + else () + list(APPEND LIST_OUT ${current_combo}) + endif () + list(APPEND CODES_OUT ${current_code}) + + endforeach () + + set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) + set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) +endfunction () + +# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition +# @param sources_in the source files to build from +# @param defines_in (optional) preprocessor definitions that will be applied to all objects +# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. +# e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" +# @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) +# @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) +# @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc) +# @param complex_filename_scheme some routines have separate source files for complex and non-complex float types. +# 0 - compiles for all types +# 1 - compiles the sources for non-complex types only (SINGLE/DOUBLE) +# 2 - compiles for complex types only (COMPLEX/DOUBLE COMPLEX) +# 3 - compiles for all types, but changes source names for complex by prepending z (e.g. axpy.c becomes zaxpy.c) +# 4 - compiles for complex types only, but changes source names for complex by prepending z (e.g. hemv.c becomes zhemv.c) +# STRING - compiles only the given type (e.g. DOUBLE) +function(GenerateNamedObjects sources_in) + + if (DEFINED ARGV1) + set(defines_in ${ARGV1}) + endif () + + if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") + set(name_in ${ARGV2}) + # strip off extension for kernel files that pass in the object name. 
+ get_filename_component(name_in ${name_in} NAME_WE) + endif () + + if (DEFINED ARGV3) + set(use_cblas ${ARGV3}) + else () + set(use_cblas false) + endif () + + if (DEFINED ARGV4) + set(replace_last_with ${ARGV4}) + endif () + + if (DEFINED ARGV5) + set(append_with ${ARGV5}) + endif () + + if (DEFINED ARGV6) + set(no_float_type ${ARGV6}) + else () + set(no_float_type false) + endif () + + if (no_float_type) + set(float_list "DUMMY") # still need to loop once + else () + set(float_list "${FLOAT_TYPES}") + endif () + + set(real_only false) + set(complex_only false) + set(mangle_complex_sources false) + if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") + if (${ARGV7} EQUAL 1) + set(real_only true) + elseif (${ARGV7} EQUAL 2) + set(complex_only true) + elseif (${ARGV7} EQUAL 3) + set(mangle_complex_sources true) + elseif (${ARGV7} EQUAL 4) + set(mangle_complex_sources true) + set(complex_only true) + elseif (NOT ${ARGV7} EQUAL 0) + set(float_list ${ARGV7}) + endif () + endif () + + if (complex_only) + list(REMOVE_ITEM float_list "SINGLE") + list(REMOVE_ITEM float_list "DOUBLE") + elseif (real_only) + list(REMOVE_ITEM float_list "COMPLEX") + list(REMOVE_ITEM float_list "ZCOMPLEX") + endif () + + set(float_char "") + set(OBJ_LIST_OUT "") + foreach (float_type ${float_list}) + foreach (source_file ${sources_in}) + + if (NOT no_float_type) + string(SUBSTRING ${float_type} 0 1 float_char) + string(TOLOWER ${float_char} float_char) + endif () + + if (NOT name_in) + get_filename_component(source_name ${source_file} NAME_WE) + set(obj_name "${float_char}${source_name}") + else () + # replace * with float_char + if (${name_in} MATCHES "\\*") + string(REPLACE "*" ${float_char} obj_name ${name_in}) + else () + set(obj_name "${float_char}${name_in}") + endif () + endif () + + if (replace_last_with) + string(REGEX REPLACE ".$" ${replace_last_with} obj_name ${obj_name}) + else () + set(obj_name "${obj_name}${append_with}") + endif () + + # now add the object and set the defines + set(obj_defines ${defines_in}) + + if (use_cblas) + set(obj_name "cblas_${obj_name}") + list(APPEND obj_defines "CBLAS") + endif () + + list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") + if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") + list(APPEND obj_defines "DOUBLE") + endif () + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + list(APPEND obj_defines "COMPLEX") + if (mangle_complex_sources) + # add a z to the filename + get_filename_component(source_name ${source_file} NAME) + get_filename_component(source_dir ${source_file} DIRECTORY) + string(REPLACE ${source_name} "z${source_name}" source_file ${source_file}) + endif () + endif () + + if (VERBOSE_GEN) + message(STATUS "${obj_name}:${source_file}") + message(STATUS "${obj_defines}") + endif () + + # create a copy of the source to avoid duplicate obj filename problem with ar.exe + get_filename_component(source_extension ${source_file} EXT) + set(new_source_file "${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${obj_name}${source_extension}") + if (IS_ABSOLUTE ${source_file}) + set(old_source_file ${source_file}) + else () + set(old_source_file "${CMAKE_CURRENT_LIST_DIR}/${source_file}") + endif () + + string(REPLACE ";" "\n#define " define_source "${obj_defines}") + string(REPLACE "=" " " define_source "${define_source}") + file(WRITE ${new_source_file} "#define ${define_source}\n#include 
\"${old_source_file}\"") + list(APPEND SRC_LIST_OUT ${new_source_file}) + + endforeach () + endforeach () + + list(APPEND OPENBLAS_SRC ${SRC_LIST_OUT}) + set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE) +endfunction () + +# generates object files for each of the sources for each of the combinations of the preprocessor definitions passed in +# @param sources_in the source files to build from +# @param defines_in the preprocessor definitions that will be combined to create the object files +# @param all_defines_in (optional) preprocessor definitions that will be applied to all objects +# @param replace_scheme If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k.c with TRANS and UNIT defined will be symm_TU. +# If 0, it will simply append the code, e.g. symm_L.c with TRANS and UNIT will be symm_LTU. +# If 2, it will append the code with an underscore, e.g. symm.c with TRANS and UNIT will be symm_TU. +# If 3, it will insert the code *around* the last character with an underscore, e.g. symm_L.c with TRANS and UNIT will be symm_TLU (required by BLAS level2 objects). +# If 4, it will insert the code before the last underscore. E.g. trtri_U_parallel with TRANS will be trtri_UT_parallel +# @param alternate_name replaces the source name as the object name (define codes are still appended) +# @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc) +# @param complex_filename_scheme see GenerateNamedObjects +function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) + + set(alternate_name_in "") + if (DEFINED ARGV5) + set(alternate_name_in ${ARGV5}) + endif () + + set(no_float_type false) + if (DEFINED ARGV6) + set(no_float_type ${ARGV6}) + endif () + + set(complex_filename_scheme "") + if (DEFINED ARGV7) + set(complex_filename_scheme ${ARGV7}) + endif () + + AllCombinations("${defines_in}" "${absent_codes_in}") + set(define_combos ${LIST_OUT}) + set(define_codes ${CODES_OUT}) + + list(LENGTH define_combos num_combos) + math(EXPR num_combos "${num_combos} - 1") + + foreach (c RANGE 0 ${num_combos}) + + list(GET define_combos ${c} define_combo) + list(GET define_codes ${c} define_code) + + foreach (source_file ${sources_in}) + + set(alternate_name ${alternate_name_in}) + + # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with + string(REPLACE ":" ";" define_combo ${define_combo}) + + # now add the object and set the defines + set(cur_defines ${define_combo}) + if ("${cur_defines}" STREQUAL " ") + set(cur_defines ${all_defines_in}) + else () + list(APPEND cur_defines ${all_defines_in}) + endif () + + set(replace_code "") + set(append_code "") + if (replace_scheme EQUAL 1) + set(replace_code ${define_code}) + else () + if (replace_scheme EQUAL 2) + set(append_code "_${define_code}") + elseif (replace_scheme EQUAL 3) + if ("${alternate_name}" STREQUAL "") + string(REGEX MATCH "[a-zA-Z]\\." 
last_letter ${source_file}) + else () + string(REGEX MATCH "[a-zA-Z]$" last_letter ${alternate_name}) + endif () + # first extract the last letter + string(SUBSTRING ${last_letter} 0 1 last_letter) # remove period from match + # break the code up into the first letter and the remaining (should only be 2 anyway) + string(SUBSTRING ${define_code} 0 1 define_code_first) + string(SUBSTRING ${define_code} 1 -1 define_code_second) + set(replace_code "${define_code_first}${last_letter}${define_code_second}") + elseif (replace_scheme EQUAL 4) + # insert code before the last underscore and pass that in as the alternate_name + if ("${alternate_name}" STREQUAL "") + get_filename_component(alternate_name ${source_file} NAME_WE) + endif () + set(extra_underscore "") + # check if filename has two underscores, insert another if not (e.g. getrs_parallel needs to become getrs_U_parallel not getrsU_parallel) + string(REGEX MATCH "_[a-zA-Z]+_" underscores ${alternate_name}) + string(LENGTH "${underscores}" underscores) + if (underscores EQUAL 0) + set(extra_underscore "_") + endif () + string(REGEX REPLACE "(.+)(_[^_]+)$" "\\1${extra_underscore}${define_code}\\2" alternate_name ${alternate_name}) + else() + set(append_code ${define_code}) # replace_scheme should be 0 + endif () + endif () + + GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" false "${replace_code}" "${append_code}" "${no_float_type}" "${complex_filename_scheme}") + endforeach () + endforeach () + + set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE) +endfunction () + diff --git a/common.h b/common.h index fe2083469..7b81c6fb6 100644 --- a/common.h +++ b/common.h @@ -82,7 +82,10 @@ extern "C" { #include #include #include + +#if !defined(_MSC_VER) #include +#endif #ifdef OS_LINUX #include @@ -93,6 +96,14 @@ extern "C" { #include #endif +#ifdef OS_ANDROID +#define NO_SYSV_IPC +//Android NDK only supports complex.h since Android 5.0 +#if __ANDROID_API__ < 21 +#define FORCE_OPENBLAS_COMPLEX_STRUCT +#endif +#endif + #ifdef OS_WINDOWS #ifdef ATOM #define GOTO_ATOM ATOM @@ -106,8 +117,11 @@ extern "C" { #endif #else #include +#ifndef NO_SYSV_IPC #include +#endif #include +#include #include #include #ifdef SMP @@ -287,13 +301,6 @@ typedef int blasint; #define COMPSIZE 2 #endif -#if defined(C_PGI) || defined(C_SUN) -#define CREAL(X) (*((FLOAT *)&X + 0)) -#define CIMAG(X) (*((FLOAT *)&X + 1)) -#else -#define CREAL __real__ -#define CIMAG __imag__ -#endif #define Address_H(x) (((x)+(1<<15))>>16) #define Address_L(x) ((x)-((Address_H(x))<<16)) @@ -307,8 +314,12 @@ typedef int blasint; #endif #if defined(OS_WINDOWS) +#if defined(_MSC_VER) && !defined(__clang__) +#define YIELDING YieldProcessor() +#else #define YIELDING SwitchToThread() #endif +#endif #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) #define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); @@ -404,7 +415,51 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif + +#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) +#ifdef _POSIX_MONOTONIC_CLOCK +#if defined(__GLIBC_PREREQ) // cut the if condition if two lines, otherwise will fail at __GLIBC_PREREQ(2, 17) +#if __GLIBC_PREREQ(2, 17) // don't require -lrt +#define USE_MONOTONIC #endif +#elif defined(OS_ANDROID) +#define USE_MONOTONIC +#endif +#endif +/* use similar scale as x86 rdtsc for timeouts to work correctly */ +static inline unsigned long long rpcc(void){ +#ifdef USE_MONOTONIC + struct timespec ts; + 
clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; +#else + struct timeval tv; + gettimeofday(&tv,NULL); + return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#endif +} +#define RPCC_DEFINED +#define RPCC64BIT +#endif // !RPCC_DEFINED + +#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) +static void __inline blas_lock(volatile BLASULONG *address){ + + do { + while (*address) {YIELDING;}; + + } while (!__sync_bool_compare_and_swap(address, 0, 1)); +} +#define BLAS_LOCK_DEFINED +#endif + +#ifndef RPCC_DEFINED +#error "rpcc() implementation is missing for your platform" +#endif +#ifndef BLAS_LOCK_DEFINED +#error "blas_lock() implementation is missing for your platform" +#endif +#endif // !ASSEMBLER #ifdef OS_LINUX #include "common_linux.h" @@ -450,18 +505,52 @@ typedef char* env_var_t; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) +#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ + (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) #define OPENBLAS_COMPLEX_C99 + #ifndef __cplusplus + #include + #endif typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; typedef xdouble _Complex openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) #else #define OPENBLAS_COMPLEX_STRUCT typedef struct { float real, imag; } openblas_complex_float; typedef struct { double real, imag; } openblas_complex_double; typedef struct { xdouble real, imag; } openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) {(real), (imag)} + #define openblas_make_complex_double(real, imag) {(real), (imag)} + #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} #endif + +#ifdef XDOUBLE +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) +#elif defined(DOUBLE) +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_double +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i) +#else +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_float +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i) +#endif + +#if defined(C_PGI) || defined(C_SUN) +#define CREAL(X) (*((FLOAT *)&X + 0)) +#define CIMAG(X) (*((FLOAT *)&X + 1)) +#else +#ifdef OPENBLAS_COMPLEX_STRUCT +#define CREAL(Z) ((Z).real) +#define CIMAG(Z) ((Z).imag) +#else +#define CREAL __real__ +#define CIMAG __imag__ +#endif +#endif + #endif // ASSEMBLER #ifndef IFLUSH @@ -478,6 +567,10 @@ typedef char* env_var_t; #endif #endif +#if defined(C_MSVC) +#define inline __inline +#endif + #ifndef ASSEMBLER #ifndef MIN @@ -499,6 +592,8 @@ void blas_set_parameter(void); int blas_get_cpu_number(void); void *blas_memory_alloc (int); void blas_memory_free (void *); +void *blas_memory_alloc_nolock (int); //use malloc without blas_lock +void blas_memory_free_nolock (void *); int get_num_procs (void); diff --git a/common_alpha.h b/common_alpha.h index 
845fb316a..9739c941d 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -76,6 +76,7 @@ static void __inline blas_lock(unsigned long *address){ "30:", address); #endif } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void){ @@ -89,6 +90,7 @@ static __inline unsigned int rpcc(void){ return r0; } +#define RPCC_DEFINED #define HALT ldq $0, 0($0) diff --git a/common_arm.h b/common_arm.h index 3cf15848a..6bf836835 100644 --- a/common_arm.h +++ b/common_arm.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,56 +30,29 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - #ifndef COMMON_ARM #define COMMON_ARM +#if defined(ARMV5) || defined(ARMV6) + #define MB #define WMB +#else + +#define MB __asm__ __volatile__ ("dmb ish" : : : "memory") +#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") + +#endif + #define INLINE inline #define RETURN_BY_COMPLEX #ifndef ASSEMBLER +#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) + static void __inline blas_lock(volatile BLASULONG *address){ int register ret; @@ -88,37 +61,29 @@ static void __inline blas_lock(volatile BLASULONG *address){ while (*address) {YIELDING;}; __asm__ __volatile__( - "ldrex r2, [%1] \n\t" - "mov r2, #0 \n\t" - "strex r3, r2, [%1] \n\t" - "mov %0 , r3 \n\t" - : "=r"(ret), "=r"(address) - : "1"(address) - : "memory", "r2" , "r3" - - + "ldrex r2, [%1] \n\t" + "strex %0, %2, [%1] \n\t" + "orr %0, r2 \n\t" + : "=&r"(ret) + : "r"(address), "r"(1) + : "memory", "r2" ); } while (ret); - + MB; } - -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} +#define BLAS_LOCK_DEFINED +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } -#if defined(DOUBLE) +#if !defined(HAVE_VFP) +/* no FPU, soft float */ +#define GET_IMAGE(res) +#elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") @@ -166,4 +131,8 @@ REALNAME: #define MAP_ANONYMOUS MAP_ANON #endif +#if !defined(ARMV5) && !defined(ARMV6) && !defined(ARMV7) && !defined(ARMV8) +#error "you must define ARMV5, ARMV6, ARMV7 or ARMV8" +#endif + #endif diff --git a/common_arm64.h b/common_arm64.h index ae79c5309..15987c677 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,49 +30,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - #ifndef COMMON_ARM64 #define COMMON_ARM64 -#define MB -#define WMB +#define MB __asm__ __volatile__ ("dmb ish" : : : "memory") +#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") + #define INLINE inline @@ -81,39 +44,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER static void __inline blas_lock(volatile BLASULONG *address){ -/* - int register ret; + + long register ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( - "ldrex r2, [%1] \n\t" - "mov r2, #0 \n\t" - "strex r3, r2, [%1] \n\t" - "mov %0 , r3 \n\t" - : "=r"(ret), "=r"(address) - : "1"(address) - : "memory", "r2" , "r3" - - + "ldaxr %0, [%1] \n\t" + "stlxr w2, %2, [%1] \n\t" + "orr %0, %0, x2 \n\t" + : "=r"(ret) + : "r"(address), "r"(1l) + : "memory", "x2" ); } while (ret); -*/ + MB; } +#define BLAS_LOCK_DEFINED -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} - static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } @@ -166,3 +117,4 @@ REALNAME: #endif #endif + diff --git a/common_c.h b/common_c.h index 741d7d087..ce0f2a5bd 100644 --- a/common_c.h +++ b/common_c.h @@ -220,6 +220,15 @@ #define COMATCOPY_K_CTC comatcopy_k_ctc #define COMATCOPY_K_RTC comatcopy_k_rtc +#define CIMATCOPY_K_CN cimatcopy_k_cn +#define CIMATCOPY_K_RN cimatcopy_k_rn +#define CIMATCOPY_K_CT cimatcopy_k_ct +#define CIMATCOPY_K_RT cimatcopy_k_rt +#define CIMATCOPY_K_CNC cimatcopy_k_cnc +#define CIMATCOPY_K_RNC cimatcopy_k_rnc +#define CIMATCOPY_K_CTC cimatcopy_k_ctc +#define CIMATCOPY_K_RTC cimatcopy_k_rtc + #define CGEADD_K cgeadd_k #else @@ -403,6 +412,16 @@ #define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc #define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc + +#define CIMATCOPY_K_CN gotoblas -> cimatcopy_k_cn +#define CIMATCOPY_K_RN gotoblas -> cimatcopy_k_rn +#define CIMATCOPY_K_CT gotoblas -> cimatcopy_k_ct +#define CIMATCOPY_K_RT gotoblas -> cimatcopy_k_rt +#define CIMATCOPY_K_CNC gotoblas -> cimatcopy_k_cnc +#define CIMATCOPY_K_RNC gotoblas -> cimatcopy_k_rnc +#define CIMATCOPY_K_CTC gotoblas -> cimatcopy_k_ctc +#define CIMATCOPY_K_RTC gotoblas -> cimatcopy_k_rtc + #define CGEADD_K gotoblas -> cgeadd_k #endif diff --git a/common_d.h b/common_d.h index d6dfd7f04..ad9945186 100644 --- a/common_d.h +++ b/common_d.h @@ -149,6 +149,11 @@ #define DOMATCOPY_K_RN domatcopy_k_rn #define DOMATCOPY_K_CT 
domatcopy_k_ct #define DOMATCOPY_K_RT domatcopy_k_rt + +#define DIMATCOPY_K_CN dimatcopy_k_cn +#define DIMATCOPY_K_RN dimatcopy_k_rn +#define DIMATCOPY_K_CT dimatcopy_k_ct +#define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k #else @@ -267,6 +272,10 @@ #define DOMATCOPY_K_RN gotoblas -> domatcopy_k_rn #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt +#define DIMATCOPY_K_CN gotoblas -> dimatcopy_k_cn +#define DIMATCOPY_K_RN gotoblas -> dimatcopy_k_rn +#define DIMATCOPY_K_CT gotoblas -> dimatcopy_k_ct +#define DIMATCOPY_K_RT gotoblas -> dimatcopy_k_rt #define DGEADD_K gotoblas -> dgeadd_k diff --git a/common_ia64.h b/common_ia64.h index 8e92b5992..72b75fc4e 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -68,6 +68,7 @@ static __inline void blas_lock(volatile unsigned long *address){ : "ar.ccv", "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void) { unsigned long clocks; @@ -75,6 +76,7 @@ static __inline unsigned long rpcc(void) { __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks)); return clocks; } +#define RPCC_DEFINED static __inline unsigned long stmxcsr(void){ @@ -99,10 +101,12 @@ static __inline void blas_lock(volatile unsigned long *address){ while (*address || _InterlockedCompareExchange((volatile int *) address,1,0)) ; } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void) { return __getReg(_IA64_REG_AR_ITC); } +#define RPCC_DEFINED static __inline unsigned int stmxcsr(void) { return __getReg(_IA64_REG_AR_FPSR); diff --git a/common_level1.h b/common_level1.h index 2a1b4f1cf..32ffd6f18 100644 --- a/common_level1.h +++ b/common_level1.h @@ -47,12 +47,12 @@ double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -float _Complex cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); -float _Complex cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); -double _Complex zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); -double _Complex zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); -xdouble _Complex xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -xdouble _Complex xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +openblas_complex_double zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); +openblas_complex_double zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); +openblas_complex_xdouble xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +openblas_complex_xdouble xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_level3.h b/common_level3.h index e0ecbc4e2..1f5490baa 100644 --- a/common_level3.h +++ b/common_level3.h @@ -1736,31 +1736,55 @@ int somatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLAS int somatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int simatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int 
simatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG); int domatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); +int dimatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG); int comatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int comatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int zomatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zimatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zomatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zimatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); 
int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 8555baa67..4976e766f 100644 --- a/common_macro.h +++ b/common_macro.h @@ -634,6 +634,11 @@ #define OMATCOPY_K_RN DOMATCOPY_K_RN #define OMATCOPY_K_CT DOMATCOPY_K_CT #define OMATCOPY_K_RT DOMATCOPY_K_RT +#define IMATCOPY_K_CN DIMATCOPY_K_CN +#define IMATCOPY_K_RN DIMATCOPY_K_RN +#define IMATCOPY_K_CT DIMATCOPY_K_CT +#define IMATCOPY_K_RT DIMATCOPY_K_RT + #define GEADD_K DGEADD_K #else @@ -931,6 +936,10 @@ #define OMATCOPY_K_RN SOMATCOPY_K_RN #define OMATCOPY_K_CT SOMATCOPY_K_CT #define OMATCOPY_K_RT SOMATCOPY_K_RT +#define IMATCOPY_K_CN SIMATCOPY_K_CN +#define IMATCOPY_K_RN SIMATCOPY_K_RN +#define IMATCOPY_K_CT SIMATCOPY_K_CT +#define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K #endif @@ -1747,6 +1756,15 @@ #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC +#define IMATCOPY_K_CN ZIMATCOPY_K_CN +#define IMATCOPY_K_RN ZIMATCOPY_K_RN +#define IMATCOPY_K_CT ZIMATCOPY_K_CT +#define IMATCOPY_K_RT ZIMATCOPY_K_RT +#define IMATCOPY_K_CNC ZIMATCOPY_K_CNC +#define IMATCOPY_K_RNC ZIMATCOPY_K_RNC +#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC +#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC + #define GEADD_K ZGEADD_K #else @@ -2160,6 +2178,14 @@ #define OMATCOPY_K_RNC COMATCOPY_K_RNC #define OMATCOPY_K_CTC COMATCOPY_K_CTC #define OMATCOPY_K_RTC COMATCOPY_K_RTC +#define IMATCOPY_K_CN CIMATCOPY_K_CN +#define IMATCOPY_K_RN CIMATCOPY_K_RN +#define IMATCOPY_K_CT CIMATCOPY_K_CT +#define IMATCOPY_K_RT CIMATCOPY_K_RT +#define IMATCOPY_K_CNC CIMATCOPY_K_CNC +#define IMATCOPY_K_RNC CIMATCOPY_K_RNC +#define IMATCOPY_K_CTC CIMATCOPY_K_CTC +#define IMATCOPY_K_RTC CIMATCOPY_K_RTC #define GEADD_K CGEADD_K diff --git a/common_mips64.h b/common_mips64.h index 7cd86b375..f5c0ec7cf 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -98,6 +98,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned int rpcc(void){ unsigned long ret; @@ -118,6 +119,7 @@ static inline unsigned int rpcc(void){ #endif return ret; } +#define RPCC_DEFINED #if defined(LOONGSON3A) || defined(LOONGSON3B) #ifndef NO_AFFINITY diff --git a/common_param.h b/common_param.h index 1b56e85f0..36d6149ea 100644 --- a/common_param.h +++ b/common_param.h @@ -855,6 +855,36 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); + + int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); + + int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, 
float*, BLASLONG); + int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + + int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + + int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + + int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); diff --git a/common_power.h b/common_power.h index f88f527bd..ab331b04a 100644 --- a/common_power.h +++ b/common_power.h @@ -87,6 +87,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ #endif } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned long rpcc(void){ unsigned long ret; @@ -103,6 +104,7 @@ static inline unsigned long rpcc(void){ #endif } +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT @@ -495,6 +497,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ REALNAME: #define EPILOGUE .size REALNAME, .-REALNAME #else +#if _CALL_ELF == 2 +#define PROLOGUE \ + .section .text;\ + .align 6;\ + .globl REALNAME;\ + .type REALNAME, @function;\ +REALNAME: +#define EPILOGUE .size REALNAME, .-REALNAME +#else #define PROLOGUE \ .section .text;\ .align 5;\ @@ -514,6 +525,7 @@ REALNAME:;\ .size .REALNAME, .-.REALNAME; \ .section .note.GNU-stack,"",@progbits #endif +#endif #ifdef PROFILE #ifndef __64BIT__ @@ -792,4 +804,25 @@ Lmcount$lazy_ptr: #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif + +#ifdef OS_LINUX +#ifndef __64BIT__ +#define FRAMESLOT(X) (((X) * 4) + 8) +#else +#if _CALL_ELF == 2 +#define FRAMESLOT(X) (((X) * 8) + 96) +#else +#define FRAMESLOT(X) (((X) * 8) + 112) +#endif +#endif +#endif + +#if defined(OS_AIX) || defined(OS_DARWIN) +#ifndef __64BIT__ +#define FRAMESLOT(X) (((X) * 4) + 56) +#else +#define FRAMESLOT(X) (((X) * 8) + 112) +#endif +#endif + #endif diff --git a/common_s.h b/common_s.h index a4d8679b7..3c1600859 100644 --- a/common_s.h +++ b/common_s.h @@ -152,6 +152,10 @@ #define SOMATCOPY_K_RN somatcopy_k_rn #define SOMATCOPY_K_CT somatcopy_k_ct #define SOMATCOPY_K_RT somatcopy_k_rt +#define SIMATCOPY_K_CN simatcopy_k_cn +#define SIMATCOPY_K_RN simatcopy_k_rn +#define SIMATCOPY_K_CT simatcopy_k_ct +#define SIMATCOPY_K_RT simatcopy_k_rt #define SGEADD_K sgeadd_k @@ -274,6 +278,10 @@ #define SOMATCOPY_K_RN gotoblas -> somatcopy_k_rn #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt +#define SIMATCOPY_K_CN gotoblas -> simatcopy_k_cn +#define SIMATCOPY_K_RN gotoblas -> simatcopy_k_rn +#define SIMATCOPY_K_CT gotoblas -> simatcopy_k_ct 
+#define SIMATCOPY_K_RT gotoblas -> simatcopy_k_rt #define SGEADD_K gotoblas -> sgeadd_k diff --git a/common_sparc.h b/common_sparc.h index 87ef75276..f99972db9 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -58,6 +58,7 @@ static void __inline blas_lock(volatile unsigned long *address){ : "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void){ unsigned long clocks; @@ -66,6 +67,7 @@ static __inline unsigned long rpcc(void){ return clocks; }; +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT diff --git a/common_x86.h b/common_x86.h index 9d82090cc..1ace84cad 100644 --- a/common_x86.h +++ b/common_x86.h @@ -56,41 +56,67 @@ static void __inline blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; +#if defined(_MSC_VER) && !defined(__clang__) + // use intrinsic instead of inline assembly + ret = _InterlockedExchange(address, 1); + // inline assembly + /*__asm { + mov eax, address + mov ebx, 1 + xchg [eax], ebx + mov ret, ebx + }*/ +#else __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) : "0"(1), "m"(*address) : "memory"); +#endif } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long long rpcc(void){ +#if defined(_MSC_VER) && !defined(__clang__) + return __rdtsc(); // use MSVC intrinsic +#else unsigned int a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); return ((unsigned long long)a + ((unsigned long long)d << 32)); +#endif }; +#define RPCC_DEFINED static __inline unsigned long getstackaddr(void){ +#if defined(_MSC_VER) && !defined(__clang__) + return (unsigned long)_ReturnAddress(); // use MSVC intrinsic +#else unsigned long addr; __asm__ __volatile__ ("mov %%esp, %0" : "=r"(addr) : : "memory"); return addr; +#endif }; static __inline long double sqrt_long(long double val) { +#if defined(_MSC_VER) && !defined(__clang__) + return sqrt(val); // not sure if this will use fsqrt +#else long double result; __asm__ __volatile__ ("fldt %1\n" "fsqrt\n" "fstpt %0\n" : "=m" (result) : "m"(val)); return result; +#endif } #define SQRT(a) sqrt_long(a) @@ -100,7 +126,7 @@ void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); #define WHEREAMI -static inline int WhereAmI(void){ +static __inline int WhereAmI(void){ int eax, ebx, ecx, edx; int apicid; @@ -146,9 +172,14 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; +#if defined(_MSC_VER) && !defined(__clang__) + (void*)result; + return x*y; +#else __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; +#endif } #endif @@ -171,7 +202,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif -#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) //Enable some optimazation for barcelona. 
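Aside on the blas_lock hunk above: the MSVC branch swaps the GCC-style xchgl inline assembly for the _InterlockedExchange intrinsic, but both paths implement the same test-and-set spinlock. Purely for illustration (not part of the patch), the same algorithm expressed with portable C11 atomics looks like this:

#include <stdatomic.h>

/* what the xchgl / _InterlockedExchange paths above boil down to */
static void blas_lock_c11(volatile atomic_ulong *address)
{
  unsigned long prev;

  do {
    /* spin on plain loads first so waiters do not bounce the cache line */
    while (atomic_load_explicit(address, memory_order_relaxed))
      ;                                            /* YIELDING would go here */
    prev = atomic_exchange_explicit(address, 1UL, memory_order_acquire);
  } while (prev);                                  /* lost the race: spin again */
}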
#define BARCELONA_OPTIMIZATION #endif @@ -284,8 +315,12 @@ REALNAME: #define PROFCODE +#ifdef __clang__ +#define EPILOGUE .end +#else #define EPILOGUE .end REALNAME #endif +#endif #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #define PROLOGUE \ diff --git a/common_x86_64.h b/common_x86_64.h index e0a6c4c42..da9afc0e4 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -41,6 +41,10 @@ #ifndef ASSEMBLER +#ifdef C_MSVC +#include +#endif + #ifdef C_SUN #define __asm__ __asm #define __volatile__ @@ -61,30 +65,45 @@ static void __inline blas_lock(volatile BLASULONG *address){ +#ifndef C_MSVC int ret; +#else + BLASULONG ret; +#endif do { while (*address) {YIELDING;}; +#ifndef C_MSVC __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) : "0"(1), "m"(*address) : "memory"); - +#else + ret=InterlockedExchange64((volatile LONG64 *)(address), 1); +#endif } while (ret); + } +#define BLAS_LOCK_DEFINED static __inline BLASULONG rpcc(void){ +#ifdef C_MSVC + return __rdtsc(); +#else BLASULONG a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); return ((BLASULONG)a + ((BLASULONG)d << 32)); +#endif } +#define RPCC_DEFINED #define RPCC64BIT +#ifndef C_MSVC static __inline BLASULONG getstackaddr(void){ BLASULONG addr; @@ -93,22 +112,32 @@ static __inline BLASULONG getstackaddr(void){ return addr; } +#endif static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ +#ifdef C_MSVC + int cpuinfo[4]; + __cpuid(cpuinfo, op); + *eax=cpuinfo[0]; + *ebx=cpuinfo[1]; + *ecx=cpuinfo[2]; + *edx=cpuinfo[3]; +#else __asm__ __volatile__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op)); +#endif } /* #define WHEREAMI */ -static inline int WhereAmI(void){ +static __inline int WhereAmI(void){ int eax, ebx, ecx, edx; int apicid; @@ -150,10 +179,14 @@ static inline int WhereAmI(void){ #define GET_IMAGE_CANCEL #ifdef SMP -#ifdef USE64BITINT +#if defined(USE64BITINT) static __inline blasint blas_quickdivide(blasint x, blasint y){ return x / y; } +#elif defined (C_MSVC) +static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ + return x / y; +} #else extern unsigned int blas_quick_divide_table[]; @@ -226,7 +259,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER -#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) //Enable some optimazation for barcelona. 
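A note on the #define BLAS_LOCK_DEFINED / #define RPCC_DEFINED markers added throughout the architecture headers above: tagging each platform-specific blas_lock()/rpcc() this way presumably lets shared code supply a generic fallback only where no tuned version exists. A hypothetical guard of that shape is sketched below; the fallback body is an assumption for illustration, not code from this patch.

#include <time.h>

#ifndef RPCC_DEFINED
/* hypothetical generic timestamp fallback (POSIX clock) when no
 * architecture-specific rpcc() has been defined above */
static __inline unsigned long long rpcc(void)
{
  struct timespec ts;

  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (unsigned long long)ts.tv_sec * 1000000000ULL
       + (unsigned long long)ts.tv_nsec;
}
#define RPCC_DEFINED
#endif

A BLAS_LOCK_DEFINED guard would work the same way around a generic lock implementation.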
#define BARCELONA_OPTIMIZATION #endif diff --git a/common_z.h b/common_z.h index b17122776..b4f58bb0c 100644 --- a/common_z.h +++ b/common_z.h @@ -220,6 +220,15 @@ #define ZOMATCOPY_K_CTC zomatcopy_k_ctc #define ZOMATCOPY_K_RTC zomatcopy_k_rtc +#define ZIMATCOPY_K_CN zimatcopy_k_cn +#define ZIMATCOPY_K_RN zimatcopy_k_rn +#define ZIMATCOPY_K_CT zimatcopy_k_ct +#define ZIMATCOPY_K_RT zimatcopy_k_rt +#define ZIMATCOPY_K_CNC zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC zimatcopy_k_rtc + #define ZGEADD_K zgeadd_k #else @@ -404,6 +413,15 @@ #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc +#define ZIMATCOPY_K_CN gotoblas -> zimatcopy_k_cn +#define ZIMATCOPY_K_RN gotoblas -> zimatcopy_k_rn +#define ZIMATCOPY_K_CT gotoblas -> zimatcopy_k_ct +#define ZIMATCOPY_K_RT gotoblas -> zimatcopy_k_rt +#define ZIMATCOPY_K_CNC gotoblas -> zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC gotoblas -> zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC gotoblas -> zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC gotoblas -> zimatcopy_k_rtc + #define ZGEADD_K gotoblas -> zgeadd_k #endif diff --git a/cpuid.h b/cpuid.h index ab6a3fb32..e9bd2d016 100644 --- a/cpuid.h +++ b/cpuid.h @@ -39,6 +39,10 @@ #ifndef CPUID_H #define CPUID_H +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#define INTEL_AMD +#endif + #define VENDOR_INTEL 1 #define VENDOR_UMC 2 #define VENDOR_AMD 3 @@ -59,7 +63,7 @@ #define FAMILY_PM 7 #define FAMILY_IA64 8 -#if defined(__i386__) || defined(__x86_64__) +#ifdef INTEL_AMD #define GET_EXFAMILY 1 #define GET_EXMODEL 2 #define GET_TYPE 3 @@ -109,6 +113,7 @@ #define CORE_PILEDRIVER 23 #define CORE_HASWELL 24 #define CORE_STEAMROLLER 25 +#define CORE_EXCAVATOR 26 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -203,5 +208,6 @@ typedef struct { #define CPUTYPE_PILEDRIVER 47 #define CPUTYPE_HASWELL 48 #define CPUTYPE_STEAMROLLER 49 +#define CPUTYPE_EXCAVATOR 50 #endif diff --git a/cpuid_arm.c b/cpuid_arm.c index 51ba72d70..6485003f3 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -192,6 +192,7 @@ void get_cpuconfig(void) { case CPU_CORTEXA9: printf("#define CORTEXA9\n"); + printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); @@ -207,6 +208,7 @@ void get_cpuconfig(void) case CPU_CORTEXA15: printf("#define CORTEXA15\n"); + printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); diff --git a/cpuid_power.c b/cpuid_power.c index 2fc333dd2..366c6ed08 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -115,6 +115,7 @@ int detect(void){ if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; diff --git a/cpuid_x86.c b/cpuid_x86.c index ef90b26d8..a65991041 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -40,6 +40,12 @@ #include #include "cpuid.h" +#if defined(_MSC_VER) && !defined(__clang__) +#define C_INLINE __inline +#else +#define C_INLINE inline +#endif + /* #ifdef NO_AVX #define CPUTYPE_HASWELL CPUTYPE_NEHALEM @@ -53,12 +59,26 @@ #endif */ +#if defined(_MSC_VER) && !defined(__clang__) + +void cpuid(int op, 
int *eax, int *ebx, int *ecx, int *edx) +{ + int cpuInfo[4] = {-1}; + __cpuid(cpuInfo, op); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +} + +#else + #ifndef CPUIDEMU #if defined(__APPLE__) && defined(__i386__) void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); #else -static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ +static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ #if defined(__i386__) && defined(__PIC__) __asm__ __volatile__ ("mov %%ebx, %%edi;" @@ -115,14 +135,16 @@ void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int * #endif -static inline int have_cpuid(void){ +#endif // _MSC_VER + +static C_INLINE int have_cpuid(void){ int eax, ebx, ecx, edx; cpuid(0, &eax, &ebx, &ecx, &edx); return eax; } -static inline int have_excpuid(void){ +static C_INLINE int have_excpuid(void){ int eax, ebx, ecx, edx; cpuid(0x80000000, &eax, &ebx, &ecx, &edx); @@ -130,10 +152,14 @@ static inline int have_excpuid(void){ } #ifndef NO_AVX -static inline void xgetbv(int op, int * eax, int * edx){ +static C_INLINE void xgetbv(int op, int * eax, int * edx){ //Use binary code for xgetbv +#if defined(_MSC_VER) && !defined(__clang__) + *eax = __xgetbv(op); +#else __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +#endif } #endif @@ -1098,6 +1124,16 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 13: + //Broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; @@ -1112,11 +1148,57 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 7: + case 15: + //Broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 14: + //Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; } break; + case 5: + switch (model) { + case 6: + //Broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 5: + case 14: + // Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -1163,11 +1245,20 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; //OS don't support AVX. case 0: - if(support_avx()) - return CPUTYPE_STEAMROLLER; - else - return CPUTYPE_BARCELONA; //OS don't support AVX. + switch(exmodel){ + case 3: + if(support_avx()) + return CPUTYPE_STEAMROLLER; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. + case 6: + if(support_avx()) + return CPUTYPE_EXCAVATOR; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. 
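The new Intel branches above appear to be filling in the known Broadwell signatures (models 0x3D, 0x47, 0x4F, 0x56) and Skylake signatures (0x4E, 0x55, 0x5E), which this file expresses as separate exmodel/model case pairs because the displayed model number combines both CPUID fields. An illustrative helper (not part of the patch) showing how family/model/exmodel relate to the raw CPUID leaf-1 signature:

/* illustration only: decode the CPUID(1) EAX signature the way this file's
 * family/exfamily/model/exmodel cases interpret it */
static void decode_signature(unsigned int eax,
                             unsigned int *family, unsigned int *model)
{
  unsigned int base_family = (eax >> 8)  & 0x0f;
  unsigned int base_model  = (eax >> 4)  & 0x0f;
  unsigned int ext_family  = (eax >> 20) & 0xff;
  unsigned int ext_model   = (eax >> 16) & 0x0f;

  *family = base_family + (base_family == 0x0f ? ext_family : 0);

  /* the extended model field is meaningful for families 6 and 15; e.g.
   * ext_model 5, base_model 14 gives 0x5E, a Skylake desktop part */
  *model = base_model
         + ((base_family == 0x06 || base_family == 0x0f) ? (ext_model << 4) : 0);
}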
+ } + break; } break; case 5: @@ -1297,6 +1388,7 @@ static char *cpuname[] = { "PILEDRIVER", "HASWELL", "STEAMROLLER", + "EXCAVATOR", }; static char *lowercpuname[] = { @@ -1349,6 +1441,7 @@ static char *lowercpuname[] = { "piledriver", "haswell", "steamroller", + "excavator", }; static char *corename[] = { @@ -1378,6 +1471,7 @@ static char *corename[] = { "PILEDRIVER", "HASWELL", "STEAMROLLER", + "EXCAVATOR", }; static char *corename_lower[] = { @@ -1407,6 +1501,7 @@ static char *corename_lower[] = { "piledriver", "haswell", "steamroller", + "excavator", }; @@ -1525,6 +1620,16 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 13: + //broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; @@ -1539,11 +1644,57 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 7: + case 15: + //broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 14: + //Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; } break; + case 5: + switch (model) { + case 6: + //broadwell + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 5: + case 14: + // Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + } + break; } break; @@ -1574,10 +1725,20 @@ int get_coretype(void){ return CORE_BARCELONA; //OS don't support AVX. case 0: - if(support_avx()) - return CORE_STEAMROLLER; - else - return CORE_BARCELONA; //OS don't support AVX. + switch(exmodel){ + case 3: + if(support_avx()) + return CORE_STEAMROLLER; + else + return CORE_BARCELONA; //OS don't support AVX. + + case 6: + if(support_avx()) + return CORE_EXCAVATOR; + else + return CORE_BARCELONA; //OS don't support AVX. 
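Both get_cpuname() and get_coretype() fall back to Barcelona/Nehalem-class targets whenever support_avx() fails, which covers not only pre-AVX silicon but also operating systems that never enable YMM state saving (hence the "//OS don't support AVX." comments). support_avx() itself is not shown in these hunks; the usual check, sketched below under that assumption, combines the CPUID OSXSAVE/AVX bits with XGETBV:

/* sketch of the support_avx() logic assumed by the fallbacks above; the real
 * function is defined elsewhere in cpuid_x86.c and may differ in detail */
static int support_avx_sketch(void)
{
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);
  if ((ecx & (1 << 28)) == 0) return 0;   /* CPU lacks AVX instructions        */
  if ((ecx & (1 << 27)) == 0) return 0;   /* OS did not enable OSXSAVE         */

  xgetbv(0, &eax, &edx);                  /* read XCR0 via the xgetbv() above  */
  return (eax & 6) == 6;                  /* XMM (bit 1) and YMM (bit 2) state saved */
}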
+ } + break; } diff --git a/ctest.c b/ctest.c index d5c224726..b5c74f137 100644 --- a/ctest.c +++ b/ctest.c @@ -44,6 +44,10 @@ COMPILER_DEC COMPILER_GNU #endif +#if defined(__ANDROID__) +OS_ANDROID +#endif + #if defined(__linux__) OS_LINUX #endif diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt new file mode 100644 index 000000000..dbe785bcb --- /dev/null +++ b/ctest/CMakeLists.txt @@ -0,0 +1,46 @@ +include_directories(${CMAKE_SOURCE_DIR}) + +enable_language(Fortran) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") + +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh +"$1 < $2\n" +) + +foreach(float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char_upper) + string(TOLOWER ${float_char_upper} float_char) + #level1 + add_executable(x${float_char}cblat1 + c_${float_char}blat1.f + c_${float_char}blas1.c) + target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}_static) + add_test(NAME "x${float_char}cblat1" + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") + + #level2 + add_executable(x${float_char}cblat2 + c_${float_char}blat2.f + c_${float_char}blas2.c + c_${float_char}2chke.c + auxiliary.c + c_xerbla.c + constant.c) + target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}_static) + add_test(NAME "x${float_char}cblat2" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") + + #level3 + add_executable(x${float_char}cblat3 + c_${float_char}blat3.f + c_${float_char}blas3.c + c_${float_char}3chke.c + auxiliary.c + c_xerbla.c + constant.c) + target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}_static) + add_test(NAME "x${float_char}cblat3" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") + +endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index 1d9567150..7a5d236aa 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -27,12 +27,18 @@ ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o +ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o + ztestl1o = c_zblas1.o ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o +ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o + + + all :: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 @@ -115,8 +121,8 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xccblat3_3m: $(ctestl3o) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) # Double complex xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) @@ -127,8 +133,8 @@ xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xzcblat3_3m: $(ztestl3o) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o 
$(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) include $(TOPDIR)/Makefile.tail diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c index 4d5de5150..3b4764c4a 100644 --- a/ctest/c_c3chke.c +++ b/ctest/c_c3chke.c @@ -46,235 +46,7 @@ void F77_c3chke(char * rout) { } - if (strncmp( sf,"cblas_cgemm3m" ,13)==0) { - cblas_rout = "cblas_cgemm3" ; - - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 
1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgemm3m( 
CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - - } else if (strncmp( sf,"cblas_cgemm" ,11)==0) { + if (strncmp( sf,"cblas_cgemm" ,11)==0) { cblas_rout = "cblas_cgemm" ; diff --git a/ctest/c_c3chke_3m.c b/ctest/c_c3chke_3m.c new file mode 100644 index 000000000..4d5de5150 --- /dev/null +++ b/ctest/c_c3chke_3m.c @@ -0,0 +1,1936 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_c3chke(char * rout) { + char *sf = ( rout ) ; + float A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + + if (strncmp( sf,"cblas_cgemm3m" ,13)==0) { + cblas_rout = "cblas_cgemm3" ; + + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + 
cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( 
CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + 
cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cgemm" ,11)==0) { + cblas_rout = "cblas_cgemm" ; + + + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + 
cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg 
= TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_chemm" ,11)==0) { + cblas_rout = "cblas_chemm" ; + + cblas_info = 1; + cblas_chemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; 
RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, 
C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csymm" ,11)==0) { + cblas_rout = "cblas_csymm" ; + + cblas_info = 1; + cblas_csymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, 
CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + 
cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrmm" ,11)==0) { + cblas_rout = "cblas_ctrmm" ; + + cblas_info = 1; + cblas_ctrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 
INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + 
cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 
2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrsm" ,11)==0) { + cblas_rout = "cblas_ctrsm" ; + + cblas_info = 1; + cblas_ctrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( 
CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 
2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( 
CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cherk" ,11)==0) { + cblas_rout = "cblas_cherk" ; + + cblas_info = 1; + cblas_cherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + 
cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyrk" ,11)==0) { + cblas_rout = 
"cblas_csyrk" ; + + cblas_info = 1; + cblas_csyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cher2k" ,12)==0) { + cblas_rout = "cblas_cher2k" ; + + cblas_info = 1; + cblas_cher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyr2k" ,12)==0) { + cblas_rout = "cblas_csyr2k" ; + + cblas_info = 1; + cblas_csyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, 
BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_cblas3.c b/ctest/c_cblas3.c index f1b108c64..9f48c49b1 100644 --- a/ctest/c_cblas3.c +++ b/ctest/c_cblas3.c @@ -567,81 +567,3 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, -void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, - int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, - CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, - CBLAS_TEST_COMPLEX *c, int *ldc ) { - - CBLAS_TEST_COMPLEX *A, *B, *C; - int i,j,LDA, LDB, LDC; - enum CBLAS_TRANSPOSE transa, transb; - - get_transpose_type(transpa, &transa); - get_transpose_type(transpb, &transb); - - if (*order == TEST_ROW_MJR) { - if (transa == CblasNoTrans) { - LDA = *k+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); - for( i=0; i<*m; i++ ) - for( j=0; j<*k; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - else { - LDA = *m+1; - A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); - for( i=0; i<*k; i++ ) - for( j=0; j<*m; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - - if (transb == CblasNoTrans) { - LDB = *n+1; - B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); - for( i=0; i<*k; i++ ) - for( j=0; j<*n; j++ ) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - else { - LDB = *k+1; - B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); - for( i=0; i<*n; i++ ) - for( j=0; j<*k; j++ ) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - - LDC = *n+1; - C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - C[i*LDC+j].real=c[j*(*ldc)+i].real; - C[i*LDC+j].imag=c[j*(*ldc)+i].imag; - } - cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, - B, LDB, beta, C, LDC ); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - c[j*(*ldc)+i].real=C[i*LDC+j].real; - c[j*(*ldc)+i].imag=C[i*LDC+j].imag; - } - free(A); - free(B); - free(C); - } - else if (*order == TEST_COL_MJR) - cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); - else - cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); -} - - diff --git a/ctest/c_cblas3_3m.c b/ctest/c_cblas3_3m.c new file mode 100644 index 000000000..f1b108c64 --- /dev/null +++ b/ctest/c_cblas3_3m.c @@ -0,0 +1,647 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/15/98, SGI/CRAY Research. 
+ */ +#include <stdlib.h> +#include "common.h" +#include "cblas_test.h" + +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + +void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + 
C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, 
*ldc ); + else + cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, float *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + B=(CBLAS_TEST_COMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_cher2k(UNDEFINED, uplo, trans, *n, 
*k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, 
*lda, b, *ldb); + else + cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + + + +void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( 
i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + + diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c index 4be4457b4..054e72360 100644 --- a/ctest/c_z3chke.c +++ b/ctest/c_z3chke.c @@ -49,237 +49,7 @@ void F77_z3chke(char * rout) { - if (strncmp( sf,"cblas_zgemm3m" ,13)==0) { - cblas_rout = "cblas_zgemm3" ; - - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 1; - cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 2; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 3; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = FALSE; 
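The F77_* shims above (F77_csymm, F77_cherk, F77_csyrk, F77_cher2k, F77_csyr2k, F77_ctrmm, F77_ctrsm, F77_cgemm3m) all handle TEST_ROW_MJR the same way: allocate a scratch copy of each operand whose leading dimension is one more than its number of columns, transpose-copy the column-major Fortran data into it, call the CBLAS routine with CblasRowMajor, and transpose-copy the output operand (c, or b for the trmm/trsm shims) back. The sketch below isolates that copy step; complex_t and to_row_major are hypothetical stand-ins for illustration, not code from the suite.

    #include <stdlib.h>

    /* Hypothetical stand-in for CBLAS_TEST_COMPLEX (illustration only). */
    typedef struct { float real, imag; } complex_t;

    /* Transpose-copy an m-by-n column-major matrix a (leading dimension
     * ld_col) into a newly allocated row-major buffer whose leading
     * dimension is n + 1, mirroring what the F77_* shims do before
     * calling cblas_* with CblasRowMajor.  The caller frees the result. */
    complex_t *to_row_major(const complex_t *a, int m, int n, int ld_col,
                            int *ld_row_out) {
      int ld_row = n + 1;                       /* padded, as in the shims */
      complex_t *A = malloc((size_t)m * ld_row * sizeof *A);
      if (A == NULL) return NULL;
      for (int i = 0; i < m; i++)               /* row of the row-major copy */
        for (int j = 0; j < n; j++) {           /* column */
          A[i * ld_row + j].real = a[j * ld_col + i].real;
          A[i * ld_row + j].imag = a[j * ld_col + i].imag;
        }
      *ld_row_out = ld_row;
      return A;
    }

The extra +1 on the leading dimension presumably keeps the stride and the matrix dimension from coinciding, so an lda/ldb mix-up inside the library shows up in the copied-back results.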
- cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); 
- chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 2, BETA, C, 2 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 2, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, - ALPHA, A, 1, B, 2, BETA, C, 1 ); - chkxer(); - cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, - ALPHA, A, 1, B, 1, BETA, C, 1 ); - chkxer(); - - - - } else if (strncmp( sf,"cblas_zgemm" ,11)==0) { + if (strncmp( sf,"cblas_zgemm" ,11)==0) { cblas_rout = "cblas_zgemm" ; cblas_info = 1; diff --git a/ctest/c_z3chke_3m.c b/ctest/c_z3chke_3m.c new file mode 100644 index 000000000..4be4457b4 --- /dev/null +++ b/ctest/c_z3chke_3m.c @@ -0,0 +1,1940 @@ +#include <stdio.h> +#include <string.h> +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_z3chke(char * rout) { + char *sf = ( rout ) ; + double A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + 
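The error-exit tests in this new file use the same handshake as the other c_*3chke sources: before each deliberately bad call, cblas_info is set to the 1-based position of the corrupted argument; the suite's replacement xerbla (defined outside this diff) is expected to clear cblas_lerr when the library reports that argument; and chkxer(), defined above, flags a failure if that did not happen and re-arms cblas_lerr for the next call. A minimal standalone illustration of the idea, with hypothetical names throughout (not code from the suite):

    #include <stdio.h>

    static int expected_arg;     /* plays the role of cblas_info          */
    static int error_caught;     /* plays the role of cblas_lerr/cblas_ok */

    /* Stand-in for the replacement xerbla: record whether the routine
     * reported exactly the argument the test expected. */
    static void report_error(int arg) {
      error_caught = (arg == expected_arg);
    }

    /* Stand-in for a CBLAS routine that validates one size argument. */
    static void demo_routine(int n) {
      if (n < 0) report_error(1);   /* argument 1 is invalid */
    }

    int main(void) {
      expected_arg = 1;             /* like: cblas_info = 1;            */
      error_caught = 0;
      demo_routine(-1);             /* like a call with INVALID there   */
      if (!error_caught)            /* like: chkxer();                  */
        printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED *****\n",
               expected_arg);
      return 0;
    }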
F77_xerbla(cblas_rout,&cblas_info); + } + + + + + + if (strncmp( sf,"cblas_zgemm3m" ,13)==0) { + cblas_rout = "cblas_zgemm3" ; + + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, 
CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; 
RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + + + } else if (strncmp( sf,"cblas_zgemm" ,11)==0) { + cblas_rout = "cblas_zgemm" ; + + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, 
CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( 
CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { + cblas_rout = "cblas_zhemm" ; + + cblas_info = 1; + cblas_zhemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; 
RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsymm" ,11)==0) { + cblas_rout = "cblas_zsymm" ; + + cblas_info = 1; + cblas_zsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, 
CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + 
cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrmm" ,11)==0) { + cblas_rout = "cblas_ztrmm" ; + + cblas_info = 1; + cblas_ztrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); 
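For reference, the cblas_info values used throughout these checks index the arguments of the CBLAS call itself: in cblas_zgemm the 14 arguments put lda, ldb and ldc at positions 9, 11 and 14; in cblas_zhemm/cblas_zsymm (13 arguments) they sit at 8, 10 and 13; in cblas_ztrmm/cblas_ztrsm (12 arguments) lda and ldb are positions 10 and 12, with M and N at 6 and 7. The tables below are an illustrative aid only, not part of the test source.

    /* Illustrative reference: 1-based argument names for the level-3
     * routines exercised here, matching the cblas_info values set before
     * each deliberately bad call. */
    const char *zgemm_args[] = {
      "order", "transa", "transb", "m", "n", "k",
      "alpha", "a", "lda", "b", "ldb", "beta", "c", "ldc" };  /* 9=lda 11=ldb 14=ldc */
    const char *zhemm_zsymm_args[] = {
      "order", "side", "uplo", "m", "n",
      "alpha", "a", "lda", "b", "ldb", "beta", "c", "ldc" };  /* 8=lda 10=ldb 13=ldc */
    const char *ztrmm_ztrsm_args[] = {
      "order", "side", "uplo", "transa", "diag", "m", "n",
      "alpha", "a", "lda", "b", "ldb" };                      /* 10=lda 12=ldb */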
+ cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrsm" ,11)==0) { + cblas_rout = "cblas_ztrsm" ; + + cblas_info = 1; + cblas_ztrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + 
CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + 
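Every case in this error-exit suite has the same three-step shape: store the 1-based position of the argument that is about to be made invalid in cblas_info, record in RowMajorStrg whether the call goes through the row-major path, issue the call with exactly that one argument invalid, and let chkxer() verify that the library's error handler fired and reported the expected position. Below is a minimal sketch of that machinery, assuming the usual CBLAS-test convention of linking against a replacement error handler; the globals cblas_info, cblas_ok, cblas_rout and RowMajorStrg appear in the tests above, while cblas_lerr and the handler shown here are illustrative stand-ins, not the harness's actual code.

/* Illustrative sketch of the error-exit machinery -- not the real harness.
 * The library is linked against a test error handler instead of one that
 * aborts, so an invalid argument just records what was reported.          */
#include <stdio.h>

static int  cblas_ok   = 1;     /* overall pass/fail for the routine       */
static int  cblas_lerr = 1;     /* 1 until the error handler is entered    */
static int  cblas_info = 0;     /* expected position of the bad argument   */
static char *cblas_rout = "";   /* name of the routine under test          */

/* Stand-in for the test suite's replacement error handler. */
static void test_error_handler(int info, const char *rout)
{
   cblas_lerr = 0;                           /* the handler did fire        */
   if (info != cblas_info) {
      printf("*** %s: expected error at argument %d, got %d\n",
             rout, cblas_info, info);
      cblas_ok = 0;
   }
}

/* Called after every deliberately invalid call in the tests above. */
static void chkxer(void)
{
   if (cblas_lerr == 1) {                    /* handler never fired         */
      printf("*** %s did not flag invalid argument %d\n",
             cblas_rout, cblas_info);
      cblas_ok = 0;
   }
   cblas_lerr = 1;                           /* re-arm for the next case    */
}

int main(void)
{
   cblas_rout = "cblas_ztrsm";
   cblas_info = 10;                    /* expect the error on argument 10   */
   test_error_handler(10, cblas_rout); /* pretend the library reported it   */
   chkxer();
   printf("%s\n", cblas_ok ? "PASSED" : "FAILED");
   return 0;
}

Read with that in mind, a block such as "cblas_info = 10; ... cblas_ztrsm( CblasColMajor, ... ); chkxer();" says: argument 10 of cblas_ztrsm (the leading dimension lda) is invalid in this call, and the library must report exactly that position.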
cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, 
CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zherk" ,11)==0) { + cblas_rout = "cblas_zherk" ; + + cblas_info = 1; + cblas_zherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg 
= FALSE; + cblas_zherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + 
cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyrk" ,11)==0) { + cblas_rout = "cblas_zsyrk" ; + + cblas_info = 1; + cblas_zsyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, 
+ ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zher2k" ,12)==0) { + cblas_rout = "cblas_zher2k" ; + + cblas_info = 1; + cblas_zher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, 
C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyr2k" ,12)==0) { + cblas_rout = "cblas_zsyr2k" ; + + cblas_info = 1; + cblas_zsyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, 
B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_zblas3.c b/ctest/c_zblas3.c index 46ff467d0..40afa4edf 100644 --- a/ctest/c_zblas3.c +++ b/ctest/c_zblas3.c @@ -564,80 +564,3 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, } -void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, - int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, - CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, - CBLAS_TEST_ZOMPLEX *c, int *ldc ) { - - CBLAS_TEST_ZOMPLEX *A, *B, *C; - int i,j,LDA, LDB, LDC; - enum CBLAS_TRANSPOSE transa, transb; - - get_transpose_type(transpa, &transa); - get_transpose_type(transpb, &transb); - - if (*order == TEST_ROW_MJR) { - if (transa == CblasNoTrans) { - LDA = *k+1; - A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); - for( i=0; i<*m; i++ ) - for( j=0; j<*k; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - else { - LDA = *m+1; - A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); - for( i=0; i<*k; i++ ) - for( j=0; j<*m; j++ ) { - A[i*LDA+j].real=a[j*(*lda)+i].real; - A[i*LDA+j].imag=a[j*(*lda)+i].imag; - } - } - - if (transb == CblasNoTrans) { - LDB = *n+1; - B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); - for( i=0; i<*k; i++ ) - for( j=0; j<*n; j++ ) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - else { - LDB = *k+1; - B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); - for( i=0; i<*n; i++ ) - for( j=0; j<*k; j++ ) { - B[i*LDB+j].real=b[j*(*ldb)+i].real; - B[i*LDB+j].imag=b[j*(*ldb)+i].imag; - } - } - - LDC = *n+1; - C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - C[i*LDC+j].real=c[j*(*ldc)+i].real; - C[i*LDC+j].imag=c[j*(*ldc)+i].imag; - } - cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, - B, LDB, beta, C, LDC ); - for( j=0; j<*n; j++ ) - for( i=0; i<*m; i++ ) { - c[j*(*ldc)+i].real=C[i*LDC+j].real; - c[j*(*ldc)+i].imag=C[i*LDC+j].imag; - } - free(A); - free(B); - free(C); - } - else if (*order == TEST_COL_MJR) - cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); - else - cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, - b, *ldb, beta, c, *ldc ); -} - diff --git a/ctest/c_zblas3_3m.c b/ctest/c_zblas3_3m.c new file mode 100644 index 000000000..46ff467d0 --- /dev/null +++ b/ctest/c_zblas3_3m.c @@ -0,0 +1,643 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/15/98, SGI/CRAY Research. 
+ */ +#include +#include "common.h" +#include "cblas_test.h" +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + 
C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + 
c, *ldc ); + else + cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_zher2k(UNDEFINED, uplo, trans, 
*n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + 
a, *lda, b, *ldb); + else + cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + + +void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( 
i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt new file mode 100644 index 000000000..696767486 --- /dev/null +++ b/driver/level2/CMakeLists.txt @@ -0,0 +1,203 @@ + +include_directories(${CMAKE_SOURCE_DIR}) + +# sources that need to be compiled twice, once with no flags and once with LOWER +set(UL_SOURCES + sbmv_k.c + spmv_k.c + spr_k.c + spr2_k.c + syr_k.c + syr2_k.c +) + +# sources that need to be compiled several times, for UNIT, TRANSA +set(U_SOURCES + trmv_U.c + tbmv_U.c + tbsv_U.c + tpmv_U.c + tpsv_U.c + trsv_U.c +) + +set(L_SOURCES + trmv_L.c + tbmv_L.c + tbsv_L.c + tpmv_L.c + tpsv_L.c + trsv_L.c +) + +set(UL_SMP_SOURCES + symv_thread.c + syr_thread.c + syr2_thread.c + spr_thread.c + spr2_thread.c + spmv_thread.c + sbmv_thread.c +) + +set(NU_SMP_SOURCES + trmv_thread.c + tpmv_thread.c + tbmv_thread.c +) + +set(ULVM_COMPLEX_SOURCES + hbmv_k.c + hpmv_k.c + hpr_k.c + hpr2_k.c + her_k.c + her2_k.c +) + +# objects that need LOWER set +GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1 "" "" 3) + +# gbmv uses a lowercase n and t +GenerateNamedObjects("gbmv_k.c" "" "gbmv_n" false "" "" "" 3) +GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t" false "" "" "" 3) +# c/zgbmv +GenerateNamedObjects("zgbmv_k.c" "CONJ" "gbmv_r" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "TRANS;CONJ" "gbmv_c" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "XCONJ" "gbmv_o" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "TRANS;XCONJ" "gbmv_u" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "CONJ;XCONJ" "gbmv_s" false "" "" "" 2) +GenerateNamedObjects("zgbmv_k.c" "TRANS;CONJ;XCONJ" "gbmv_d" false "" "" "" 2) + +# special defines for complex +foreach (float_type ${FLOAT_TYPES}) + + if (SMP) + GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false ${float_type}) + + GenerateNamedObjects("gbmv_thread.c" "" "gbmv_thread_n" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) + endif () + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + + foreach (u_source ${U_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${u_source}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=1" 0 "${op_name}_NU" false ${float_type}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=2" 0 "${op_name}_TL" false ${float_type}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=3" 0 "${op_name}_RU" false ${float_type}) + GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=4" 0 "${op_name}_CL" false ${float_type}) + endforeach () + + foreach (l_source ${L_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${l_source}) + GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=1" 0 "${op_name}_NL" false ${float_type}) + GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=2" 0 "${op_name}_TU" false ${float_type}) + GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=3" 0 "${op_name}_RL" false ${float_type}) + 
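The pattern driving this whole CMakeLists.txt is stated in its own comments above ("sources that need to be compiled twice", "objects that need LOWER set"): one kernel source is compiled several times, each pass with a different set of preprocessor definitions (LOWER, UNIT, TRANSA=1..4, plus the precision flags), and each pass is written to a distinctly named object, which is what the GenerateNamedObjects / GenerateCombinationObjects helpers from cmake/utils.cmake automate. The fragment below is a hand-written illustration of that one-source/many-objects idea, with a made-up file and symbol name rather than one of the real level-2 kernels.

/* example_kernel.c -- hypothetical source illustrating the scheme used by
 * the level-2 kernels: the same file is compiled with different -D flags
 * and lands in differently named objects, e.g.
 *   cc -c example_kernel.c                  # "upper, non-unit" variant
 *   cc -c example_kernel.c -DLOWER          # "lower" variant
 *   cc -c example_kernel.c -DUNIT           # "unit diagonal" variant
 *   cc -c example_kernel.c -DLOWER -DUNIT   # both                        */
#include <stddef.h>
#include <stdio.h>

/* Sums one triangle of an n-by-n column-major matrix with leading
 * dimension lda (lda >= n). Which triangle is walked, and whether the
 * diagonal is included, is decided at compile time by LOWER and UNIT. */
double example_kernel(size_t n, const double *a, size_t lda)
{
  double s = 0.0;
  for (size_t j = 0; j < n; j++) {
#ifdef LOWER
    for (size_t i = j; i < n; i++)      /* lower triangle of column j      */
#else
    for (size_t i = 0; i <= j; i++)     /* upper triangle of column j      */
#endif
    {
#ifdef UNIT
      if (i == j) continue;             /* UNIT: skip the stored diagonal  */
#endif
      s += a[j * lda + i];
    }
  }
  return s;
}

int main(void)
{
  /* 2x2 column-major matrix {{1,2},{3,4}} stored with lda = 2 */
  const double a[4] = { 1.0, 3.0, 2.0, 4.0 };
  printf("triangle sum = %g\n", example_kernel(2, a, 2));
  return 0;
}

In the real build each such compilation is registered as a CMake object whose name encodes the chosen combination (for instance the _NU/_TL suffixes used above, or the upper/lower pairs generated for the UL_SOURCES list), so the interface layer can link exactly the variant a given BLAS call needs.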
GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=4" 0 "${op_name}_CU" false ${float_type}) + endforeach () + + foreach (ulvm_source ${ULVM_COMPLEX_SOURCES}) + string(REGEX MATCH "[a-z0-9]+" op_name ${ulvm_source}) + GenerateNamedObjects("z${ulvm_source}" "" "${op_name}_U" false "" "" false ${float_type}) + GenerateNamedObjects("z${ulvm_source}" "LOWER" "${op_name}_L" false "" "" false ${float_type}) + GenerateNamedObjects("z${ulvm_source}" "HEMVREV" "${op_name}_V" false "" "" false ${float_type}) + GenerateNamedObjects("z${ulvm_source}" "LOWER;HEMVREV" "${op_name}_M" false "" "" false ${float_type}) + endforeach() + + if (SMP) + + GenerateNamedObjects("gemv_thread.c" "CONJ" "gemv_thread_r" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "CONJ;TRANSA" "gemv_thread_c" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ" "gemv_thread_o" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ;TRANSA" "gemv_thread_u" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ;CONJ" "gemv_thread_s" false "" "" false ${float_type}) + GenerateNamedObjects("gemv_thread.c" "XCONJ;CONJ;TRANSA" "gemv_thread_d" false "" "" false ${float_type}) + + GenerateNamedObjects("gbmv_thread.c" "CONJ" "gbmv_thread_r" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "CONJ;TRANSA" "gbmv_thread_c" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ" "gbmv_thread_o" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ;TRANSA" "gbmv_thread_u" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ;CONJ" "gbmv_thread_s" false "" "" false ${float_type}) + GenerateNamedObjects("gbmv_thread.c" "XCONJ;CONJ;TRANSA" "gbmv_thread_d" false "" "" false ${float_type}) + + GenerateNamedObjects("ger_thread.c" "" "ger_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("ger_thread.c" "CONJ" "ger_thread_C" false "" "" false ${float_type}) + GenerateNamedObjects("ger_thread.c" "XCONJ" "ger_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("ger_thread.c" "XCONJ;CONJ" "ger_thread_D" false "" "" false ${float_type}) + + GenerateNamedObjects("sbmv_thread.c" "HEMV" "hbmv_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("sbmv_thread.c" "HEMV;LOWER" "hbmv_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("sbmv_thread.c" "HEMVREV" "hbmv_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("sbmv_thread.c" "LOWER;HEMVREV" "hbmv_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("spmv_thread.c" "HEMV" "hpmv_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("spmv_thread.c" "HEMV;LOWER" "hpmv_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("spmv_thread.c" "HEMVREV" "hpmv_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("spmv_thread.c" "LOWER;HEMVREV" "hpmv_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("spr_thread.c" "HEMV" "hpr_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("spr_thread.c" "HEMV;LOWER" "hpr_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("spr_thread.c" "HEMVREV" "hpr_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("spr_thread.c" "LOWER;HEMVREV" "hpr_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("spr2_thread.c" "HEMV" "hpr2_thread_U" false "" "" false ${float_type}) + 
GenerateNamedObjects("spr2_thread.c" "HEMV;LOWER" "hpr2_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("spr2_thread.c" "HEMVREV" "hpr2_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("spr2_thread.c" "LOWER;HEMVREV" "hpr2_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("symv_thread.c" "HEMV" "hemv_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("symv_thread.c" "HEMV;LOWER" "hemv_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("symv_thread.c" "HEMVREV" "hemv_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("symv_thread.c" "LOWER;HEMVREV" "hemv_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("syr_thread.c" "HER" "her_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "HER;LOWER" "her_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "HERREV" "her_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("syr_thread.c" "LOWER;HERREV" "her_thread_M" false "" "" false ${float_type}) + + GenerateNamedObjects("syr2_thread.c" "HER" "her2_thread_U" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "HER;LOWER" "her2_thread_L" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "HERREV" "her2_thread_V" false "" "" false ${float_type}) + GenerateNamedObjects("syr2_thread.c" "LOWER;HERREV" "her2_thread_M" false "" "" false ${float_type}) + + foreach (nu_smp_src ${NU_SMP_SOURCES}) + string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_src}) + GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=1" 0 "${op_name}_N" false ${float_type}) + GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=2" 0 "${op_name}_T" false ${float_type}) + GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "${op_name}_R" false ${float_type}) + GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=4" 0 "${op_name}_C" false ${float_type}) + endforeach () + endif () + + else () + # For real number functions + foreach (u_source ${U_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${u_source}) + GenerateCombinationObjects("${u_source}" "UNIT" "N" "" 0 "${op_name}_NU" false ${float_type}) + GenerateCombinationObjects("${u_source}" "UNIT" "N" "TRANSA" 0 "${op_name}_TL" false ${float_type}) + endforeach () + + foreach (l_source ${L_SOURCES}) + string(REGEX MATCH "[a-z]+" op_name ${l_source}) + GenerateCombinationObjects("${l_source}" "UNIT" "N" "" 0 "${op_name}_NL" false ${float_type}) + GenerateCombinationObjects("${l_source}" "UNIT" "N" "TRANSA" 0 "${op_name}_TU" false ${float_type}) + endforeach () + + if (SMP) + GenerateNamedObjects("ger_thread.c" "" "" false "" "" false ${float_type}) + foreach(nu_smp_source ${NU_SMP_SOURCES}) + string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_source}) + GenerateCombinationObjects("${nu_smp_source}" "LOWER;UNIT" "U;N" "" 0 "${op_name}_N" false ${float_type}) + GenerateCombinationObjects("${nu_smp_source}" "LOWER;UNIT" "U;N" "TRANSA" 0 "${op_name}_T" false ${float_type}) + endforeach() + endif () + endif () +endforeach () + +if (SMP) + GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "" 2) +endif () + +add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index 9efe17092..ef9d58d76 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -64,7 +64,7 @@ static int gbmv_kernel(blas_arg_t *args, 
BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index ddd475367..061454848 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -62,6 +62,11 @@ #endif #endif +#ifndef TRANSA +#define Y_DUMMY_NUM 1024 +static FLOAT y_dummy[Y_DUMMY_NUM]; +#endif + static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; @@ -99,10 +104,15 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a += n_from * lda * COMPSIZE; #ifdef TRANSA y += n_from * incy * COMPSIZE; +#else + //for split matrix row (n) direction and vector x of gemv_n + x += n_from * incx * COMPSIZE; + //store partial result for every thread + y += (m_to - m_from) * 1 * COMPSIZE * pos; #endif } - // fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d\n", m_from, m_to, n_from, n_to); + //fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d POS=%d\n", m_from, m_to, n_from, n_to, pos); GEMV(m_to - m_from, n_to - n_from, 0, *((FLOAT *)args -> alpha + 0), @@ -126,6 +136,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x BLASLONG width, i, num_cpu; +#ifndef TRANSA + int split_x=0; +#endif + #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE @@ -198,6 +212,58 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x i -= width; } +#ifndef TRANSA + //try to split matrix on row direction and x. + //Then, reduction. + if (num_cpu < nthreads) { + + //too small to split or bigger than the y_dummy buffer. + double MN = (double) m * (double) n; + if ( MN <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD)) + || m*COMPSIZE*nthreads > Y_DUMMY_NUM) + goto Outer; + + num_cpu = 0; + range[0] = 0; + + memset(y_dummy, 0, sizeof(FLOAT) * m * COMPSIZE * nthreads); + + args.ldc = 1; + args.c = (void *)y_dummy; + + //split on row (n) and x + i=n; + split_x=1; + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + if (width < 4) width = 4; + if (i < width) width = i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = gemv_kernel; + queue[num_cpu].args = &args; + + queue[num_cpu].position = num_cpu; + + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = &range[num_cpu]; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + } + + Outer: +#endif + if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; @@ -206,5 +272,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x exec_blas(num_cpu, queue); } +#ifndef TRANSA + if(split_x==1){ + //reduction + for(i=0; i a; diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index 93a2f44d4..0f47344df 100644 --- a/driver/level2/spmv_thread.c +++ b/driver/level2/spmv_thread.c @@ -60,7 +60,7 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif a = (FLOAT *)args -> a; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index 3c1249448..bbb1c50eb 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -76,7 +76,7 @@ static 
int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c index 3b91cee45..47dc1daf9 100644 --- a/driver/level2/tpmv_thread.c +++ b/driver/level2/tpmv_thread.c @@ -81,7 +81,7 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 29e9799f6..a9dc2dc62 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -87,7 +87,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef COMPLEX FLOAT result; #else - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #endif diff --git a/driver/level2/zgbmv_k.c b/driver/level2/zgbmv_k.c index 68d6045bd..d89932e33 100644 --- a/driver/level2/zgbmv_k.c +++ b/driver/level2/zgbmv_k.c @@ -77,7 +77,7 @@ void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOA FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; #ifdef TRANS - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif if (incy != 1) { diff --git a/driver/level2/zhbmv_k.c b/driver/level2/zhbmv_k.c index 70e92e050..33f70d2c5 100644 --- a/driver/level2/zhbmv_k.c +++ b/driver/level2/zhbmv_k.c @@ -56,6 +56,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *bufferX = sbmvbuffer; FLOAT temp[2]; + OPENBLAS_COMPLEX_FLOAT result; + if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); @@ -93,7 +95,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTC_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + result = DOTC_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -118,7 +120,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTC_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + result = DOTC_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -143,7 +145,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -168,7 +170,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * 
CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); diff --git a/driver/level2/zhpmv_k.c b/driver/level2/zhpmv_k.c index 96bceaaf2..9e7ed7b0e 100644 --- a/driver/level2/zhpmv_k.c +++ b/driver/level2/zhpmv_k.c @@ -51,6 +51,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *bufferX = gemvbuffer; FLOAT temp[2]; + OPENBLAS_COMPLEX_FLOAT result; + if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #ifndef HEMVREV #ifndef LOWER if (i > 0) { - FLOAT _Complex result = DOTC_K(i, a, 1, X, 1); + result = DOTC_K(i, a, 1, X, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -93,7 +95,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #else if (m - i > 1) { - FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -118,7 +120,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #else #ifndef LOWER if (i > 0) { - FLOAT _Complex result = DOTU_K(i, a, 1, X, 1); + result = DOTU_K(i, a, 1, X, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -142,7 +144,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #else if (m - i > 1) { - FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); diff --git a/driver/level2/zsbmv_k.c b/driver/level2/zsbmv_k.c index 30e2f91c3..3ae74ce80 100644 --- a/driver/level2/zsbmv_k.c +++ b/driver/level2/zsbmv_k.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *bufferY = sbmvbuffer; FLOAT *bufferX = sbmvbuffer; + OPENBLAS_COMPLEX_FLOAT result; + if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); @@ -83,7 +85,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); @@ -100,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, a, 1, Y + i * COMPSIZE, 1, NULL, 0); if (length > 0) { - FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); diff --git a/driver/level2/zspmv_k.c b/driver/level2/zspmv_k.c index 76657eab9..432205e83 100644 --- a/driver/level2/zspmv_k.c +++ b/driver/level2/zspmv_k.c @@ -49,7 +49,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT 
*bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; - FLOAT _Complex result; + + OPENBLAS_COMPLEX_FLOAT result; if (incy != 1) { Y = bufferY; diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c index 74ff0bce1..1ac1cdef1 100644 --- a/driver/level2/ztbmv_L.c +++ b/driver/level2/ztbmv_L.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c index 933275de3..9aa203396 100644 --- a/driver/level2/ztbmv_U.c +++ b/driver/level2/ztbmv_U.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c index 0726bbd16..9aa701841 100644 --- a/driver/level2/ztbsv_L.c +++ b/driver/level2/ztbsv_L.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztbsv_U.c b/driver/level2/ztbsv_U.c index d022650bc..3722b1f71 100644 --- a/driver/level2/ztbsv_U.c +++ b/driver/level2/ztbsv_U.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c index 12c254c12..47e6df56c 100644 --- a/driver/level2/ztpmv_L.c +++ b/driver/level2/ztpmv_L.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c index 59708b8b8..da911fb4e 100644 --- a/driver/level2/ztpmv_U.c +++ b/driver/level2/ztpmv_U.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c index 3b8e562ce..a497e42a4 100644 --- a/driver/level2/ztpsv_L.c +++ b/driver/level2/ztpsv_L.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c index 601ac2f9d..28b824e3a 100644 --- a/driver/level2/ztpsv_U.c +++ b/driver/level2/ztpsv_U.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztrmv_L.c 
b/driver/level2/ztrmv_L.c index 63522cf81..92c86aec2 100644 --- a/driver/level2/ztrmv_L.c +++ b/driver/level2/ztrmv_L.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c index 8a4494fd7..f9671c9d6 100644 --- a/driver/level2/ztrmv_U.c +++ b/driver/level2/ztrmv_U.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex temp; + OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; diff --git a/driver/level2/ztrsv_L.c b/driver/level2/ztrsv_L.c index 90f1c2c7d..dd3b2786e 100644 --- a/driver/level2/ztrsv_L.c +++ b/driver/level2/ztrsv_L.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level2/ztrsv_U.c b/driver/level2/ztrsv_U.c index bec8114f3..8803182a8 100644 --- a/driver/level2/ztrsv_U.c +++ b/driver/level2/ztrsv_U.c @@ -46,7 +46,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt new file mode 100644 index 000000000..41d440f7a --- /dev/null +++ b/driver/level3/CMakeLists.txt @@ -0,0 +1,115 @@ +include_directories(${CMAKE_SOURCE_DIR}) + +# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa + +# loop through gemm.c defines +set(GEMM_DEFINES NN NT TN TT) +set(GEMM_COMPLEX_DEFINES RN CN RT CT NR TR RR CR NC TC RC CC) +foreach (GEMM_DEFINE ${GEMM_DEFINES}) + string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) + if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) + endif () +endforeach () + + +set(TRMM_TRSM_SOURCES + trmm_L.c + trmm_R.c + trsm_L.c + trsm_R.c) + +foreach(trmm_trsm_source ${TRMM_TRSM_SOURCES}) + string(REGEX MATCH "[a-z]+_[A-Z]+" op_name ${trmm_trsm_source}) + GenerateCombinationObjects("${trmm_trsm_source}" "UPPER;UNIT" "L;N" "" 0 "${op_name}N") + GenerateCombinationObjects("${trmm_trsm_source}" "UPPER;UNIT" "L;N" "TRANSA" 0 "${op_name}T") +endforeach() + +GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "NN" 1) +GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "" 1) +GenerateCombinationObjects("syr2k_k.c" "LOWER;TRANS" "U;N" "" 1) +GenerateCombinationObjects("syrk_kernel.c" "LOWER" "U" "" 2) +GenerateCombinationObjects("syr2k_kernel.c" "LOWER" "U" "" 2) +if (SMP) + + # N.B. these do NOT have a float type (e.g. DOUBLE) defined! 
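  # (Editorial note, not part of the patch: gemm_thread_m.c, gemm_thread_n.c,
  #  gemm_thread_mn.c, gemm_thread_variable.c and syrk_thread.c appear to be the
  #  generic SMP work-partitioning drivers shared by all precisions, which is
  #  presumably why they are generated once here without a SINGLE/DOUBLE/COMPLEX
  #  define, unlike the precision-specific kernels above.)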
+ GenerateNamedObjects("gemm_thread_m.c;gemm_thread_n.c;gemm_thread_mn.c;gemm_thread_variable.c;syrk_thread.c" "" "" 0 "" "" 1) + + if (NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "THREADED_LEVEL3" 2 "syrk_thread") + GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "THREADED_LEVEL3;NN" 2 "symm_thread") + endif () +endif () + +foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateCombinationObjects("zherk_kernel.c" "LOWER;CONJ" "U;N" "HERK" 2 "herk_kernel" false ${float_type}) + # TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type}) + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type}) + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) + # Need to set CONJ for trmm and trsm + GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type}) + GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type}) + GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_RR" false ${float_type}) + GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_RC" false ${float_type}) + GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_LR" false ${float_type}) + GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trsm_LC" false ${float_type}) + GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_RR" false ${float_type}) + GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trsm_RC" false ${float_type}) + + #hemm + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN" 0 "hemm_L" false ${float_type}) + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE" 0 "hemm_R" false ${float_type}) + + #her2k + GenerateCombinationObjects("zher2k_kernel.c" "LOWER;CONJ" "U;N" "" 2 "her2k_kernel" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K" "her2k_UN" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;TRANS;CONJ" "her2k_UC" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER" "her2k_LN" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type}) + + if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + #hemm + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type}) + GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type}) + #her2k + GenerateNamedObjects("zher2k_k.c" "HER2K" "her2k_UN" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;TRANS;CONJ" "her2k_UC" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER" "her2k_LN" false "" "" false ${float_type}) + GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type}) + endif() + + # special gemm defines for complex + foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) + string(TOLOWER ${gemm_define} gemm_define_LC) + GenerateNamedObjects("gemm.c" 
"${gemm_define}" "gemm_${gemm_define_LC}" false "" "" false ${float_type}) + if(USE_GEMM3M) + GenerateNamedObjects("gemm3m.c" "${gemm_define}" "gemm3m_${gemm_define_LC}" false "" "" false ${float_type}) + endif() + if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false ${float_type}) + if(USE_GEMM3M) + GenerateNamedObjects("gemm3m.c" "${gemm_define};THREADED_LEVEL3" "gemm3m_thread_${gemm_define_LC}" false "" "" false ${float_type}) + endif() + endif () + endforeach () + endif () +endforeach () + +#HPLOBJS = +# dgemm_nn.c dgemm_nt.c dgemm_tn.c dgemm_tt.c +# dtrsm_LNUU.c dtrsm_LNUN.c dtrsm_LNLU.c dtrsm_LNLN.c +# dtrsm_LTUU.c dtrsm_LTUN.c dtrsm_LTLU.c dtrsm_LTLN.c +# dtrsm_RNUU.c dtrsm_RNUN.c dtrsm_RNLU.c dtrsm_RNLN.c +# dtrsm_RTUU.c dtrsm_RTUN.c dtrsm_RTLU.c dtrsm_RTLN.c +# +#if (USE_SIMPLE_THREADED_LEVEL3) +# HPLOBJS += dgemm_thread_nn.c dgemm_thread_nt.c +# dgemm_thread_tn.c dgemm_thread_tt.c +#endif +# + +add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) diff --git a/driver/level3/syr2k_k.c b/driver/level3/syr2k_k.c index 8df0f122f..09131fbdb 100644 --- a/driver/level3/syr2k_k.c +++ b/driver/level3/syr2k_k.c @@ -47,7 +47,7 @@ #endif #endif -static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { +static __inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; diff --git a/driver/level3/syrk_k.c b/driver/level3/syrk_k.c index 08751dc8b..8bc817f87 100644 --- a/driver/level3/syrk_k.c +++ b/driver/level3/syrk_k.c @@ -49,7 +49,7 @@ #endif #endif -static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { +static __inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index bdd9370cd..0882aa496 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -70,6 +70,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; +#if !((!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))) + BLASLONG start_ls; +#endif m = args -> m; n = args -> n; @@ -226,8 +229,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } #else - BLASLONG start_ls; - for(js = n; js > 0; js -= GEMM_R){ min_j = js; if (min_j > GEMM_R) min_j = GEMM_R; diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c index 78da0eb6c..d8130ee7e 100644 --- a/driver/level3/trsm_L.c +++ b/driver/level3/trsm_L.c @@ -76,6 +76,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; +#if !((!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))) + BLASLONG start_is; +#endif m = args -> m; n = args -> n; @@ -178,8 +181,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } } #else - BLASLONG start_is; - for(ls = m; ls > 0; ls -= GEMM_Q){ min_l = ls; if (min_l > GEMM_Q) min_l = GEMM_Q; diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c index 169441d1e..f6a57f93f 100644 --- a/driver/level3/trsm_R.c +++ b/driver/level3/trsm_R.c @@ -75,6 +75,9 @@ int 
CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; +#if !((defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))) + BLASLONG start_ls; +#endif m = args -> m; n = args -> n; @@ -226,8 +229,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } #else - BLASLONG start_ls; - for(js = n; js > 0; js -= GEMM_R){ min_j = js; if (min_j > GEMM_R) min_j = GEMM_R; diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt new file mode 100644 index 000000000..b2af55e36 --- /dev/null +++ b/driver/others/CMakeLists.txt @@ -0,0 +1,75 @@ +include_directories(${CMAKE_SOURCE_DIR}) + +if (${CORE} STREQUAL "PPC440") + set(MEMORY memory_qalloc.c) +else () + set(MEMORY memory.c) +endif () + +if (SMP) + + if (USE_OPENMP) + set(BLAS_SERVER blas_server_omp.c) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(BLAS_SERVER blas_server_win32.c) + endif () + + if (NOT DEFINED BLAS_SERVER) + set(BLAS_SERVER blas_server.c) + endif () + + set(SMP_SOURCES + ${BLAS_SERVER} + divtable.c # TODO: Makefile has -UDOUBLE + blas_l1_thread.c + ) + + if (NOT NO_AFFINITY) + list(APPEND SMP_SOURCES init.c) + endif () +endif () + +set(COMMON_SOURCES + xerbla.c + openblas_set_num_threads.c + openblas_error_handle.c + openblas_get_num_procs.c + openblas_get_num_threads.c +) + +# these need to have NAME/CNAME set, so use GenerateNamedObjects, but don't use standard name mangling +GenerateNamedObjects("abs.c" "" "c_abs" 0 "" "" 1 ) +GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1) +GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) + +if (DYNAMIC_ARCH) + list(APPEND COMMON_SOURCES dynamic.c) +else () + list(APPEND COMMON_SOURCES parameter.c) +endif () + +#ifdef EXPRECISION +#COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) +#endif +# +#ifdef QUAD_PRECISION +#COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) +#endif +# +#ifdef USE_CUDA +#COMMONOBJS += cuda_init.$(SUFFIX) +#endif +# +#ifdef FUNCTION_PROFILE +#COMMONOBJS += profile.$(SUFFIX) +#endif + +#LIBOTHERS = libothers.$(LIBSUFFIX) + +#ifeq ($(DYNAMIC_ARCH), 1) +#HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +#else +#HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) +#endif + +add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b3b1ce7bd..e1c644a80 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,9 +70,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*********************************************************************/ #include "common.h" -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #include +#include #include +#include #endif #ifndef likely @@ -265,7 +267,7 @@ int get_node(void); static int increased_threads = 0; -static int blas_thread_server(void *arg){ +static void* blas_thread_server(void *arg){ /* Thread identifier */ BLASLONG cpu = (BLASLONG)arg; @@ -425,6 +427,10 @@ static int blas_thread_server(void *arg){ main_status[cpu] = MAIN_FINISH; #endif + // arm: make sure all results are written out _before_ + // thread is marked as done and other threads use them + WMB; + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ WMB; @@ -454,7 +460,7 @@ static int blas_thread_server(void *arg){ //pthread_exit(NULL); - return 0; + return NULL; } #ifdef MONITOR @@ -561,14 +567,23 @@ int blas_thread_init(void){ #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #else ret=pthread_create(&blas_threads[i], NULL, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #endif if(ret!=0){ - fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret); - exit(1); + struct rlimit rlim; + const char *msg = strerror(ret); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); + if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { + fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " + "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max)); + } + if(0 != raise(SIGINT)) { + fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n"); + exit(EXIT_FAILURE); + } } } @@ -775,7 +790,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ stop = rpcc(); #endif - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) { + exec_blas_async_wait(num - 1, queue -> next); + + // arm: make sure results from other threads are visible + MB; + } #ifdef TIMING_DEBUG fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", @@ -823,10 +843,10 @@ void goto_set_num_threads(int num_threads) { #ifdef NEED_STACKATTR pthread_create(&blas_threads[i], &attr, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #else pthread_create(&blas_threads[i], NULL, - (void *)&blas_thread_server, (void *)i); + &blas_thread_server, (void *)i); #endif } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 60b3c72af..c41164559 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -67,6 +67,7 @@ extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; extern gotoblas_t gotoblas_STEAMROLLER; +extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #else @@ -79,6 +80,7 @@ extern gotoblas_t gotoblas_HASWELL; #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA +#define gotoblas_EXCAVATOR gotoblas_BARCELONA #endif @@ -221,6 +223,15 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
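/* (Editorial note, not part of the patch: the detection cases added in this
 * dynamic.c hunk and the ones that follow share one pattern -- newer Intel
 * models (Broadwell, Skylake) are mapped onto the existing Haswell kernels
 * when support_avx() confirms the OS saves AVX state, and otherwise fall back
 * to the Nehalem kernels, mirroring the original Haswell case.) */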
} } + //Intel Broadwell + if (model == 13) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; case 4: //Intel Haswell @@ -232,6 +243,44 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + //Intel Broadwell + if (model == 7 || model == 15) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Skylake + if (model == 14) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + case 5: + //Intel Broadwell + if (model == 6) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Skylake + if (model == 14 || model == 5) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: @@ -278,12 +327,22 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } }else if(model == 0){ - //AMD STEAMROLLER - if(support_avx()) - return &gotoblas_STEAMROLLER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + if (exmodel == 3) { + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 6) { + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + } } @@ -328,6 +387,7 @@ static char *corename[] = { "Piledriver", "Haswell", "Steamroller", + "Excavator", }; char *gotoblas_corename(void) { @@ -353,6 +413,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; + if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; return corename[0]; } @@ -383,7 +444,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { - + case 22: return (&gotoblas_EXCAVATOR); case 21: return (&gotoblas_STEAMROLLER); case 20: return (&gotoblas_HASWELL); case 19: return (&gotoblas_PILEDRIVER); diff --git a/driver/others/memory.c b/driver/others/memory.c index 4010ec974..ba3dc8a23 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef OS_WINDOWS #include +#ifndef NO_SYSV_IPC #include +#endif #include #endif @@ -137,8 +139,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) +#if defined(_MSC_VER) && !defined(__clang__) +#define CONSTRUCTOR __cdecl +#define DESTRUCTOR __cdecl +#elif defined(OS_DARWIN) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) +#else +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) +#endif #ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; @@ -169,6 +179,14 @@ int get_num_procs(void) { #endif #endif +#ifdef OS_ANDROID +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN); + return nums; +} +#endif + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -266,7 +284,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP. -#if !defined(OS_WINDOWS) && defined(SMP_SERVER) +#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -276,7 +294,7 @@ void openblas_fork_handler() int blas_get_cpu_number(void){ env_var_t p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -284,7 +302,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -308,7 +326,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -709,8 +727,6 @@ static void *alloc_shm(void *address){ return map_address; } -#endif - #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS static void alloc_hugetlb_free(struct release_t *release){ @@ -787,12 +803,12 @@ static void *alloc_hugetlb(void *address){ if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); - return -1; + return (void*)-1; } if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) { CloseHandle(hToken); - return -1; + return (void*)-1; } map_address = (void *)VirtualAlloc(address, @@ -817,6 +833,8 @@ static void *alloc_hugetlb(void *address){ } #endif +#endif + #ifdef ALLOC_HUGETLBFILE static int hugetlb_pid = 0; @@ -917,12 +935,13 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER alloc_devicedirver, #endif -#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS - alloc_hugetlb, -#endif +/* Hugetlb implicitly assumes ALLOC_SHM */ #ifdef ALLOC_SHM alloc_shm, #endif +#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) + alloc_hugetlb, +#endif #ifdef ALLOC_MMAP alloc_mmap, #endif @@ -1062,7 +1081,7 @@ void *blas_memory_alloc(int procpos){ } #endif -#if defined OS_LINUX 
|| defined OS_AIX || defined __sun__ || defined OS_WINDOWS +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif @@ -1142,6 +1161,9 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif + // arm: ensure all writes are finished before other thread takes this memory + WMB; + memory[position].used = 0; #ifdef DEBUG @@ -1161,6 +1183,16 @@ void blas_memory_free(void *free_area){ return; } +void *blas_memory_alloc_nolock(int unused) { + void *map_address; + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + return map_address; +} + +void blas_memory_free_nolock(void * map_address) { + free(map_address); +} + void blas_shutdown(void){ int pos; @@ -1378,6 +1410,28 @@ void DESTRUCTOR gotoblas_quit(void) { #endif } +#if defined(_MSC_VER) && !defined(__clang__) +BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) +{ + switch (ul_reason_for_call) + { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + default: + break; + } + return TRUE; +} +#endif + #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) /* Don't call me; this is just work around for PGI / Sun bug */ void gotoblas_dummy_for_PGI(void) { diff --git a/exports/Makefile b/exports/Makefile index 1fdaf2213..177e975ea 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -100,8 +100,8 @@ else $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed $(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def endif -ifeq ($(NOFORTRAN), 2) -#only build cblas without Fortran +ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) +#only build without Fortran $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) diff --git a/f_check b/f_check index 5719faff1..e7e46886f 100644 --- a/f_check +++ b/f_check @@ -3,11 +3,11 @@ # # 1. Not specified # 1.1 Automatically detect, then check compiler -# 1.2 If no fortran compiler is detected, g77 is default with NOFORTRAN definition +# 1.2 If no fortran compiler is detected, gfortran is default with NOFORTRAN definition # 2. Specified # 2.1 If path is correct, check compiler # 2.2 If path is not correct, but still valid compiler name, force setting -# 2.2.2 Path is not correct, invalid compiler name, then g77 is default with NOFORTRAN definition +# 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition # $makefile = shift(@ARGV); @@ -25,7 +25,7 @@ $compiler = "" if $compiler eq "f77"; if ($compiler eq "") { - @lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95", + @lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95", "sunf77", "sunf90", "sunf95", "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", @@ -38,6 +38,7 @@ OUTER: foreach $path (@path) { if (-x $path . "/" . 
$lists) { $compiler = $lists; + $compiler_bin = $lists; last OUTER; } } @@ -48,8 +49,8 @@ OUTER: if ($compiler eq "") { $nofortran = 1; - $compiler = "g77"; - $vendor = G77; + $compiler = "gfortran"; + $vendor = GFORTRAN; $bu = "_"; } else { @@ -196,8 +197,8 @@ if ($compiler eq "") { if ($vendor eq "") { $nofortran = 1; - $compiler = "g77"; - $vendor = G77; + $compiler = "gfortran"; + $vendor = GFORTRAN; $bu = "_"; $openmp = ""; } diff --git a/getarch.c b/getarch.c index ee5f55fd1..0a49fd1b3 100644 --- a/getarch.c +++ b/getarch.c @@ -69,10 +69,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#if defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) +#if defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64) #define OS_WINDOWS #endif +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#define INTEL_AMD +#endif + #include #include #ifdef OS_WINDOWS @@ -116,6 +120,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_POWER4 */ /* #define FORCE_POWER5 */ /* #define FORCE_POWER6 */ +/* #define FORCE_POWER7 */ +/* #define FORCE_POWER8 */ /* #define FORCE_PPCG4 */ /* #define FORCE_PPC970 */ /* #define FORCE_PPC970MP */ @@ -448,6 +454,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "STEAMROLLER" #endif +#if defined (FORCE_EXCAVATOR) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "EXCAVATOR" +#define ARCHCONFIG "-DEXCAVATOR " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ + "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" +#define LIBNAME "excavator" +#define CORENAME "EXCAVATOR" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE @@ -530,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER5" #endif -#ifdef FORCE_POWER6 +#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" @@ -732,7 +754,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA9" #define SUBDIRNAME "arm" -#define ARCHCONFIG "-DCORTEXA9 " \ +#define ARCHCONFIG "-DCORTEXA9 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ @@ -747,7 +769,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA15" #define SUBDIRNAME "arm" -#define ARCHCONFIG "-DCORTEXA15 " \ +#define ARCHCONFIG "-DCORTEXA15 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ @@ -780,8 +802,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHCONFIG "-DARMV5 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ - "-DHAVE_VFP" + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " #define LIBNAME "armv5" #define CORENAME "ARMV5" #else @@ -813,7 +834,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif -#if defined(__i386__) || (__x86_64__) +#ifdef INTEL_AMD #include "cpuid_x86.c" #define OPENBLAS_SUPPORTED #endif @@ -908,7 +929,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -928,7 +949,7 @@ int main(int argc, char *argv[]){ #endif -#if defined(__i386__) || defined(__x86_64__) +#ifdef INTEL_AMD #ifndef FORCE get_sse(); #else @@ -1008,7 +1029,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt new file mode 100644 index 000000000..9ff924e5f --- /dev/null +++ b/interface/CMakeLists.txt @@ -0,0 +1,166 @@ + +include_directories(${CMAKE_SOURCE_DIR}) + + +set(BLAS1_SOURCES + copy.c + nrm2.c +) + +set(BLAS1_REAL_ONLY_SOURCES + rotm.c rotmg.c # N.B. 
these do not have complex counterparts + rot.c + asum.c +) + +# these will have 'z' prepended for the complex version +set(BLAS1_MANGLED_SOURCES + axpy.c swap.c + scal.c + dot.c + rotg.c + axpby.c +) + +# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# these all have 'z' sources for complex versions +set(BLAS2_SOURCES + gemv.c ger.c + trsv.c trmv.c symv.c + syr.c syr2.c gbmv.c + sbmv.c spmv.c + spr.c spr2.c + tbsv.c tbmv.c + tpsv.c tpmv.c +) + +set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES + hemv.c hbmv.c + her.c her2.c + hpmv.c hpr.c + hpr2.c +) + +# these do not have separate 'z' sources +set(BLAS3_SOURCES + gemm.c symm.c + trsm.c syrk.c syr2k.c +) + +set(BLAS3_MANGLED_SOURCES + omatcopy.c imatcopy.c + geadd.c +) + +# generate the BLAS objs once with and once without cblas +set (CBLAS_FLAGS "") + +if (NOT DEFINED NO_FBLAS) + list(APPEND CBLAS_FLAGS 0) +endif () + +if (NOT DEFINED NO_CBLAS) + list(APPEND CBLAS_FLAGS 1) +endif () + +foreach (CBLAS_FLAG ${CBLAS_FLAGS}) + + # TODO: don't compile complex sources with cblas for now, the naming schemes are all different and they will have to be handled separately from SINGLE/DOUBLE + set(DISABLE_COMPLEX 0) + set(MANGLE_COMPLEX 3) + if (CBLAS_FLAG EQUAL 1) +# set(DISABLE_COMPLEX 1) +# set(MANGLE_COMPLEX 1) + endif () + GenerateNamedObjects("${BLAS1_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) + GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) + GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) + GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) + GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + + #sdsdot, dsdot + GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") + GenerateNamedObjects("dsdot.c" "" "dsdot" ${CBLAS_FLAG} "" "" true "SINGLE") + + # trmm is trsm with a compiler flag set + GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) + + # max and imax are compiled 4 times + GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) + GenerateNamedObjects("max.c" "USE_ABS" "amax" ${CBLAS_FLAG}) + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "amin" ${CBLAS_FLAG}) + GenerateNamedObjects("max.c" "USE_MIN" "min" ${CBLAS_FLAG}) + + GenerateNamedObjects("imax.c" "" "i*max" ${CBLAS_FLAG}) + GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" ${CBLAS_FLAG}) + GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) + GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) + + +# complex-specific sources +foreach (float_type ${FLOAT_TYPES}) + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "" "dotu" ${CBLAS_FLAG} "" "" false ${float_type}) + + GenerateNamedObjects("symm.c" "HEMM" "hemm" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("syrk.c" "HEMM" "herk" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) + + 
if (USE_GEMM3M) + GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) + endif() + endif () + if (${float_type} STREQUAL "COMPLEX") + GenerateNamedObjects("zscal.c" "SSCAL" "sscal" ${CBLAS_FLAG} "" "" false "COMPLEX") + GenerateNamedObjects("nrm2.c" "" "scnrm2" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("zrot.c" "" "csrot" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") + endif () + if (${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") + GenerateNamedObjects("nrm2.c" "" "dznrm2" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("zrot.c" "" "zdrot" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + endif () +endforeach () + +endforeach () + +#Special functions for CBLAS +if (NOT DEFINED NO_CBLAS) + foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + #cblas_dotc_sub cblas_dotu_sub + GenerateNamedObjects("zdot.c" "FORCE_USE_STACK" "dotu_sub" 1 "" "" false ${float_type}) + GenerateNamedObjects("zdot.c" "FORCE_USE_STACK;CONJ" "dotc_sub" 1 "" "" false ${float_type}) + endif() + endforeach () +endif() + +if (NOT DEFINED NO_LAPACK) + set(LAPACK_SOURCES + lapack/gesv.c + ) + + # prepend z for complex versions + set(LAPACK_MANGLED_SOURCES + lapack/getrf.c lapack/getrs.c lapack/potrf.c lapack/getf2.c + lapack/potf2.c lapack/laswp.c lapack/lauu2.c + lapack/lauum.c lapack/trti2.c lapack/trtri.c + ) + + GenerateNamedObjects("${LAPACK_SOURCES}") + GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) +endif () + +add_library(interface OBJECT ${OPENBLAS_SRC}) diff --git a/interface/gemm.c b/interface/gemm.c index a5a2b4724..7253b0500 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -121,6 +121,9 @@ void NAME(char *TRANSA, char *TRANSB, FLOAT *sa, *sb; #ifdef SMP + int nthreads_max; + int nthreads_avail; + double MNK; #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -237,6 +240,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *sa, *sb; #ifdef SMP + int nthreads_max; + int nthreads_avail; + double MNK; #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -400,15 +406,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); - int nthreads_max = num_cpu_avail(3); - int nthreads_avail = nthreads_max; + nthreads_max = num_cpu_avail(3); + nthreads_avail = nthreads_max; #ifndef COMPLEX - double MNK = (double) args.m * (double) args.n * (double) args.k; + MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; #else - double MNK = (double) args.m * (double) args.n * (double) args.k; + MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; #endif diff --git 
a/interface/gemv.c b/interface/gemv.c index f33973ef3..0a222a645 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -38,6 +38,7 @@ #include #include "common.h" +#include "l1param.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif @@ -80,6 +81,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *buffer; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; + double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { @@ -134,6 +138,9 @@ void CNAME(enum CBLAS_ORDER order, blasint info, t; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; + double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { @@ -189,7 +196,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif - + //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; @@ -211,24 +218,33 @@ void CNAME(enum CBLAS_ORDER order, #ifdef MAX_STACK_ALLOC // make it volatile because some gemv implementation (ex: dgemv_n.S) // do not restore all register - volatile int stack_alloc_size = m + n; + volatile int stack_alloc_size = 0; + //for gemv_n and gemv_t, try to allocate on stack + stack_alloc_size = m + n; +#ifdef ALIGNED_ACCESS + stack_alloc_size += 3; +#endif if(stack_alloc_size < 128) - //dgemv_n.S require a 128 bytes buffer - stack_alloc_size = 128; + //dgemv_n.S require a 128 bytes buffer + stack_alloc_size = 128; + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) - stack_alloc_size = 0; + stack_alloc_size = 0; + FLOAT stack_buffer[stack_alloc_size]; buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); + // printf("stack_alloc_size=%d\n", stack_alloc_size); #else + //Original OpenBLAS/GotoBLAS codes. buffer = (FLOAT *)blas_memory_alloc(1); #endif #ifdef SMP - int nthreads_max = num_cpu_avail(2); - int nthreads_avail = nthreads_max; + nthreads_max = num_cpu_avail(2); + nthreads_avail = nthreads_max; - double MNK = (double) m * (double) n; + MNK = (double) m * (double) n; if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; @@ -251,10 +267,13 @@ void CNAME(enum CBLAS_ORDER order, #endif #ifdef MAX_STACK_ALLOC - if(!stack_alloc_size) -#endif + if(!stack_alloc_size){ blas_memory_free(buffer); - + } +#else + blas_memory_free(buffer); +#endif + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 89f0ec823..f4309a85c 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -50,6 +51,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef malloc #undef free +/* Enables the New IMATCOPY code with inplace operation if lda == ldb */ +#define NEW_IMATCOPY + #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { @@ -75,7 +79,6 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, #else void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb) { - char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; @@ -117,6 +120,34 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } +#ifdef NEW_IMATCOPY + if ( *lda == *ldb ) { + if ( order == BlasColMajor ) + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); + } + } + else + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); + } + } + return; + } + +#endif if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT); diff --git a/interface/imax.c b/interface/imax.c index 55ffa7c6e..4378f1e22 100644 --- a/interface/imax.c +++ b/interface/imax.c @@ -136,6 +136,8 @@ blasint NAME(blasint *N, FLOAT *x, blasint *INCX){ ret = (blasint)MAX_K(n, x, incx); + if(ret > n) ret=n; + FUNCTION_PROFILE_END(COMPSIZE, n, 0); IDEBUG_END; @@ -159,6 +161,8 @@ CBLAS_INDEX CNAME(blasint n, FLOAT *x, blasint incx){ ret = MAX_K(n, x, incx); + if (ret > n) ret=n; + if (ret) ret --; FUNCTION_PROFILE_END(COMPSIZE, n, 0); diff --git a/interface/rotg.c b/interface/rotg.c index 49088ab02..a0e6efdab 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -14,8 +14,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #endif - -#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da = *DA; long double db = *DB; diff --git a/interface/trsm.c b/interface/trsm.c index 266372988..3d4aed282 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -362,6 +362,12 @@ void CNAME(enum CBLAS_ORDER order, mode |= (side << BLAS_RSIDE_SHIFT); args.nthreads = num_cpu_avail(3); + if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) + args.nthreads = 1; + else + if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) + args.nthreads = 1; + if (args.nthreads == 1) { #endif diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 9e8324432..1abb24de9 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -53,13 +53,13 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT * #endif - if (n <= 0) return; - FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); + if (n <= 0) return; + FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * 2; diff --git a/interface/zdot.c b/interface/zdot.c index 1380ce292..d4d0fab92 100644 --- a/interface/zdot.c +++ b/interface/zdot.c @@ -57,21 +57,25 @@ #ifdef RETURN_BY_STRUCT MYTYPE NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #elif defined RETURN_BY_STACK -void NAME(FLOAT _Complex *result, blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +void NAME(OPENBLAS_COMPLEX_FLOAT *result, blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #else 
-FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +OPENBLAS_COMPLEX_FLOAT NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #endif BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; #ifndef RETURN_BY_STACK - FLOAT _Complex ret; + OPENBLAS_COMPLEX_FLOAT ret; #endif #ifdef RETURN_BY_STRUCT MYTYPE myret; #endif +#ifndef RETURN_BY_STRUCT + OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); +#endif + PRINT_DEBUG_NAME; if (n <= 0) { @@ -80,10 +84,10 @@ FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, myret.i = 0.; return myret; #elif defined RETURN_BY_STACK - *result = ZERO; + *result = zero; return; #else - return ZERO; + return zero; #endif } @@ -144,21 +148,24 @@ FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, #else #ifdef FORCE_USE_STACK -void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT _Complex *result){ +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, OPENBLAS_COMPLEX_FLOAT *result){ #else -FLOAT _Complex CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ +OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ - FLOAT _Complex ret; + OPENBLAS_COMPLEX_FLOAT ret; + OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); #endif PRINT_DEBUG_CNAME; if (n <= 0) { #ifdef FORCE_USE_STACK - *result = ZERO; + //*result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); + CREAL(*result) = 0.0; + CIMAG(*result) = 0.0; return; #else - return ZERO; + return zero; #endif } diff --git a/interface/zgemv.c b/interface/zgemv.c index 704034aaf..520136b45 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -79,6 +79,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *buffer; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; + double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, @@ -91,14 +94,14 @@ void NAME(char *TRANS, blasint *M, blasint *N, blasint lenx, leny; blasint i; - PRINT_DEBUG_NAME; - FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); + PRINT_DEBUG_NAME; + TOUPPER(trans); info = 0; @@ -145,6 +148,9 @@ void CNAME(enum CBLAS_ORDER order, blasint info, t; #ifdef SMP int nthreads; + int nthreads_max; + int nthreads_avail; + double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, @@ -153,14 +159,14 @@ void CNAME(enum CBLAS_ORDER order, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; - PRINT_DEBUG_CNAME; - FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); + PRINT_DEBUG_CNAME; + trans = -1; info = 0; @@ -234,10 +240,10 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - int nthreads_max = num_cpu_avail(2); - int nthreads_avail = nthreads_max; + nthreads_max = num_cpu_avail(2); + nthreads_avail = nthreads_max; - double MNK = (double) m * (double) n; + MNK = (double) m * (double) n; if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) )) nthreads_max = 1; diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 3f273cf13..b1e1d15dc 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
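The zdot and zgemv hunks above remove FLOAT _Complex from the exported signatures and build their zero and result values through OPENBLAS_COMPLEX_FLOAT, OPENBLAS_MAKE_COMPLEX_FLOAT, CREAL and CIMAG, so the interface keeps compiling where C99 _Complex is unavailable (the arm zdot change later in this patch guards on _MSC_VER for the same reason). Those macros come from OpenBLAS' common headers; the float-only shim below is merely one plausible shape for them, not the project's actual definitions.

#if !defined(_MSC_VER)
  #include <complex.h>
  typedef float _Complex OPENBLAS_COMPLEX_FLOAT;
  #define OPENBLAS_MAKE_COMPLEX_FLOAT(r, i)  ((float)(r) + (float)(i) * _Complex_I)
  #define CREAL(z)  (__real__ (z))     /* GCC/Clang accessors, usable as lvalues */
  #define CIMAG(z)  (__imag__ (z))
#else
  typedef struct { float real, imag; } OPENBLAS_COMPLEX_FLOAT;
  static __inline OPENBLAS_COMPLEX_FLOAT openblas_make_cf(float r, float i)
  { OPENBLAS_COMPLEX_FLOAT z; z.real = r; z.imag = i; return z; }
  #define OPENBLAS_MAKE_COMPLEX_FLOAT(r, i)  openblas_make_cf((float)(r), (float)(i))
  #define CREAL(z)  ((z).real)
  #define CIMAG(z)  ((z).imag)
#endif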
*****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -49,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BlasTransConj 2 #define BlasConj 3 +#define NEW_IMATCOPY #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) @@ -124,6 +126,52 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, return; } +#ifdef NEW_IMATCOPY + if (*lda == *ldb) { + if ( order == BlasColMajor ) + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + else + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + return; + } +#endif + if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; else diff --git a/interface/zrotg.c b/interface/zrotg.c index e9e8a11df..187343d41 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -6,13 +6,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ - PRINT_DEBUG_NAME; - - IDEBUG_START; - - FUNCTION_PROFILE_START(); - -#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); long double da_i = *(DA + 1); @@ -22,6 +16,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long double ada = fabs(da_r) + fabs(da_i); + PRINT_DEBUG_NAME; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + if (ada == ZERO) { *C = ZERO; *(S + 0) = ONE; @@ -54,6 +54,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ FLOAT ada = fabs(da_r) + fabs(da_i); FLOAT adb; + PRINT_DEBUG_NAME; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + if (ada == ZERO) { *C = ZERO; *(S + 0) = ONE; diff --git a/interface/zsyr.c b/interface/zsyr.c index 5fe29cefa..09b1de578 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -121,6 +121,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO FLOAT *buffer; int trans, uplo; blasint info; + FLOAT * ALPHA = α + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; #ifdef SMP int nthreads; #endif diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt new file mode 100644 index 000000000..8a3b021cc --- /dev/null +++ b/kernel/CMakeLists.txt @@ -0,0 +1,428 @@ + +include_directories(${CMAKE_SOURCE_DIR}) +include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake") + +# Makefile + +if (DEFINED TARGET_CORE) + #override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + set(BUILD_KERNEL 1) + set(KDIR "") + set(TSUFFIX "_${TARGET_CORE}") +else () + set(TARGET_CORE ${CORE}) + set(KDIR "") + set(TSUFFIX "") +endif () + 
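Both the real and complex imatcopy hunks above add a NEW_IMATCOPY fast path: when lda == ldb the requested transform is handed straight to the in-place IMATCOPY_K_* kernels instead of being staged through a scratch buffer sized from lda and ldb. For the simplest variant, column-major and non-transposed, such an in-place kernel amounts to scaling the rows-by-cols block, as in the hypothetical helper below (it mirrors the alpha == 1.0 / alpha == 0.0 shortcuts of the generic kernel but is not its source).

/* In-place column-major, non-transposed imatcopy for real data: A := alpha * A. */
static int imatcopy_cn_sketch(long rows, long cols, double alpha,
                              double *a, long lda)
{
    long i, j;
    if (rows <= 0 || cols <= 0 || alpha == 1.0)
        return 0;                                /* nothing to do */
    for (j = 0; j < cols; j++) {
        double *col = a + j * lda;               /* column-major layout */
        for (i = 0; i < rows; i++)
            col[i] = (alpha == 0.0) ? 0.0 : col[i] * alpha;
    }
    return 0;
}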
+SetDefaultL1() +SetDefaultL2() +SetDefaultL3() +ParseMakefileVars("${KERNELDIR}/KERNEL") +ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") + +if (${ARCH} STREQUAL "x86") +if (NOT MSVC) + GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) +else() + GenerateNamedObjects("${KERNELDIR}/cpuid_win.c" "" "" false "" "" true) +endif() +endif () + +# don't use float type name mangling here +GenerateNamedObjects("${KERNELDIR}/${LSAME_KERNEL}" "F_INTERFACE" "lsame" false "" "" true) +GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "COMPLEX;F_INTERFACE" "scabs1" false "" "" true) +GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE;COMPLEX;F_INTERFACE" "dcabs1" false "" "" true) + +# Makefile.L1 +foreach (float_type ${FLOAT_TYPES}) + # a bit of metaprogramming here to pull out the appropriate KERNEL var + string(SUBSTRING ${float_type} 0 1 float_char) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) + if (DEFINED ${float_char}MAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type}) + endif () + if (DEFINED ${float_char}MINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type}) + endif () + GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type}) + if (DEFINED I${float_char}MAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type}) + endif () + if (DEFINED I${float_char}MINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type}) + endif () + GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dotu_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "CONJ" "dotc_k" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dot_k" false "" "" false ${float_type}) + endif () + + if (${float_type} STREQUAL "COMPLEX") + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "srot_k" false "" "" false ${float_type}) + endif() + if (${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "drot_k" false "" "" false ${float_type}) + endif() + +endforeach () + +#dsdot,sdsdot +GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") +GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + +# Makefile.L2 +GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) +GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) +foreach (float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "XCONJ" "gerv_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ;XCONJ" "gerd_k" false "" "" false ${float_type}) + + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANSA" "gemv_t" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "CONJ" "gemv_r" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "CONJ;TRANSA" "gemv_c" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ" "gemv_o" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;CONJ;TRANSA" "gemv_d" false "" "" false ${float_type}) + + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_U_KERNEL}" "HEMV" "hemv_U" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_L_KERNEL}" "HEMV;LOWER" "hemv_L" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_V_KERNEL}" "HEMV;HEMVREV" "hemv_V" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_M_KERNEL}" "HEMV;HEMVREV;LOWER" "hemv_M" false "" "" false ${float_type}) + + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false ${float_type}) + endif () +endforeach () + +# Makefile.L3 +set(USE_TRMM false) + +if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell") + set(USE_TRMM true) +endif () + +foreach (float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) + + if (${float_char}GEMMINCOPY) + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) + endif () + + if (${float_char}GEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMITCOPY}" "${float_type}" "${${float_char}GEMMITCOPYOBJ}" false "" "" true ${float_type}) + endif () + + if (${float_char}GEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMONCOPY}" "${float_type}" "${${float_char}GEMMONCOPYOBJ}" false "" "" true ${float_type}) + endif () + + if (${float_char}GEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" "${float_type}" "${${float_char}GEMMOTCOPYOBJ}" false "" "" true ${float_type}) + endif () + + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_BETA}" "" "gemm_beta" false "" "" false ${float_type}) + + if (USE_TRMM) + set(TRMM_KERNEL "${${float_char}TRMMKERNEL}") + else () + set(TRMM_KERNEL "${${float_char}GEMMKERNEL}") + endif () + + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + + # just enumerate all these. there is an extra define for these indicating which side is a conjugate (e.g. CN NC NN) that I don't really want to work into GenerateCombinationObjects + + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NN" "gemm_kernel_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CN" "gemm_kernel_l" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false ${float_type}) + + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;NN" "trmm_kernel_LN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;TRANSA;NN" "trmm_kernel_LT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;CONJ;CN" "trmm_kernel_LR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;TRANSA;CONJ;CN" "trmm_kernel_LC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;NN" "trmm_kernel_RN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;TRANSA;NN" "trmm_kernel_RT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;CONJ;NC" "trmm_kernel_RR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;TRANSA;CONJ;NC" "trmm_kernel_RC" false "" "" false ${float_type}) + + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false ${float_type}) + + + #hemm + GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" 
"" false ${float_type}) + GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) + + # symm for c and z + GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" 
"OUTER" "trsm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) + + else () #For real + GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) + + # symm for s and d + GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + + # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. + # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. 
+ + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) 
+ GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) + GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) + + endif () + + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + + + + if (NOT DEFINED ${float_char}OMATCOPY_CN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CN ../arm/zomatcopy_cn.c) + else () + set(${float_char}OMATCOPY_CN ../arm/omatcopy_cn.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_RN ../arm/zomatcopy_rn.c) + else () + set(${float_char}OMATCOPY_RN ../arm/omatcopy_rn.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_CT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CT ../arm/zomatcopy_ct.c) + else () + set(${float_char}OMATCOPY_CT ../arm/omatcopy_ct.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_RT ../arm/zomatcopy_rt.c) + else () + set(${float_char}OMATCOPY_RT ../arm/omatcopy_rt.c) + endif () + endif () + + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CN}" "" "omatcopy_k_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RN}" "ROWM" "omatcopy_k_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CT}" "" "omatcopy_k_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RT}" "ROWM" "omatcopy_k_rt" false "" "" false ${float_type}) + + if (NOT DEFINED ${float_char}OMATCOPY_CNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CNC ../arm/zomatcopy_cnc.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_RNC ../arm/zomatcopy_rnc.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_CTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}OMATCOPY_CTC ../arm/zomatcopy_ctc.c) + endif () + endif () + if (NOT DEFINED ${float_char}OMATCOPY_RTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + 
set(${float_char}OMATCOPY_RTC ../arm/zomatcopy_rtc.c) + endif () + endif () + + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CNC}" "CONJ" "omatcopy_k_cnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RNC}" "CONJ;ROWM" "omatcopy_k_rnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CTC}" "CONJ" "omatcopy_k_ctc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RTC}" "CONJ;ROWM" "omatcopy_k_rtc" false "" "" false ${float_type}) + endif() + + #imatcopy + if (NOT DEFINED ${float_char}IMATCOPY_CN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CN ../generic/zimatcopy_cn.c) + else () + set(${float_char}IMATCOPY_CN ../generic/imatcopy_cn.c) + endif () + endif () + + if (NOT DEFINED ${float_char}IMATCOPY_RN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RN ../generic/zimatcopy_rn.c) + else () + set(${float_char}IMATCOPY_RN ../generic/imatcopy_rn.c) + endif () + endif () + + if (NOT DEFINED ${float_char}IMATCOPY_CT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CT ../generic/zimatcopy_ct.c) + else () + set(${float_char}IMATCOPY_CT ../generic/imatcopy_ct.c) + endif () + endif () + + if (NOT DEFINED ${float_char}IMATCOPY_RT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RT ../generic/zimatcopy_rt.c) + else () + set(${float_char}IMATCOPY_RT ../generic/imatcopy_rt.c) + endif () + endif () + + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CN}" "" "imatcopy_k_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RN}" "ROWM" "imatcopy_k_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CT}" "" "imatcopy_k_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RT}" "ROWM" "imatcopy_k_rt" false "" "" false ${float_type}) + + + if (NOT DEFINED ${float_char}IMATCOPY_CNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CNC ../generic/zimatcopy_cnc.c) + endif () + endif () + if (NOT DEFINED ${float_char}IMATCOPY_RNC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RNC ../generic/zimatcopy_rnc.c) + endif () + endif () + if (NOT DEFINED ${float_char}IMATCOPY_CTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_CTC ../generic/zimatcopy_ctc.c) + endif () + endif () + if (NOT DEFINED ${float_char}IMATCOPY_RTC) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}IMATCOPY_RTC ../generic/zimatcopy_rtc.c) + endif () + endif () + + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CNC}" "CONJ" "imatcopy_k_cnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RNC}" "CONJ;ROWM" "imatcopy_k_rnc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CTC}" "CONJ" "imatcopy_k_ctc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RTC}" "CONJ;ROWM" "imatcopy_k_rtc" false "" "" false ${float_type}) + endif() + + #geadd + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) +endforeach () + +# Makefile.LA +#DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +add_library(kernel OBJECT ${OPENBLAS_SRC}) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index fdbae2daa..63e675b8d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -32,6 +32,10 @@ ifeq ($(TARGET), GENERIC) USE_TRMM = 1 endif +ifeq ($(CORE), HASWELL) +USE_TRMM = 1 +endif + SKERNELOBJS += \ @@ -330,11 +334,15 @@ endif SBLASOBJS += \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ sgeadd_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dgeadd_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ @@ -342,6 +350,10 @@ CBLASOBJS += \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cgeadd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ @@ -349,6 +361,10 @@ ZBLASOBJS += \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) @@ -3301,6 +3317,34 @@ endif $(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef DIMATCOPY_CN +DIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RN +DIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef DIMATCOPY_CT +DIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + +$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RT +DIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef SOMATCOPY_CN SOMATCOPY_CN = ../arm/omatcopy_cn.c endif @@ -3329,6 +3373,34 @@ endif $(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(SOMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef SIMATCOPY_CN +SIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RN +SIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SIMATCOPY_CT +SIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + +$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RT +SIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef COMATCOPY_CN COMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3386,6 +3458,63 @@ endif $(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef CIMATCOPY_CN +CIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RN +CIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_CT +CIMATCOPY_CT = ../generic/zimatcopy_ct.c +endif + +$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RT +CIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_CNC +CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RNC +CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_CTC +CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RTC +CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + + ifndef ZOMATCOPY_CN ZOMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3443,6 +3572,62 @@ endif $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef ZIMATCOPY_CN +ZIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RN +ZIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CT +ZIMATCOPY_CT = ../generic/zimatcopy_ct.c +endif + 
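The simatcopy/dimatcopy/cimatcopy/zimatcopy rules above (and the CMake GenerateNamedObjects calls earlier in the patch) all rely on the same trick: a single generic source file is compiled repeatedly with different -DDOUBLE / -DCOMPLEX / -DROWM / -DCONJ combinations, and each combination yields a differently named object. The fragment below is an illustrative stand-in for such a source, showing only how -DDOUBLE and -DROWM steer one translation unit; it is not generic/imatcopy_cn.c itself.

#ifdef DOUBLE
typedef double FLOAT;          /* -DDOUBLE selects the d-prefixed object        */
#else
typedef float  FLOAT;          /* default build produces the s-prefixed object  */
#endif

int imatcopy_flag_demo(long rows, long cols, FLOAT alpha, FLOAT *a, long lda)
{
    long i, j;
    for (j = 0; j < cols; j++)
        for (i = 0; i < rows; i++)
#ifdef ROWM
            a[i * lda + j] *= alpha;   /* -DROWM: lda is the row length     */
#else
            a[i + j * lda] *= alpha;   /* default: lda is the column length */
#endif
    return 0;
}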
+$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RT +ZIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CNC +ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RNC +ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_CTC +ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RTC +ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + ifndef SGEADD_K SGEADD_K = ../generic/geadd.c @@ -3455,7 +3640,7 @@ ifndef DGEADD_K DGEADD_K = ../generic/geadd.c endif -$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) +$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef CGEADD_K diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c index 4593e2279..91ca76569 100644 --- a/kernel/arm/scal.c +++ b/kernel/arm/scal.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS j++; } - return; + return 0; } diff --git a/kernel/arm/zaxpby.c b/kernel/arm/zaxpby.c index 2e0c2940d..d9948349d 100644 --- a/kernel/arm/zaxpby.c +++ b/kernel/arm/zaxpby.c @@ -38,13 +38,16 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG ix,iy; FLOAT temp; + BLASLONG inc_x2; + BLASLONG inc_y2; + if ( n < 0 ) return(0); ix = 0; iy = 0; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; if ( beta_r == 0.0 && beta_i == 0.0) { diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c index 929ee8b54..1dcaeac27 100644 --- a/kernel/arm/zaxpy.c +++ b/kernel/arm/zaxpy.c @@ -41,6 +41,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { BLASLONG i=0; BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n < 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); @@ -48,8 +50,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, ix = 0; iy = 0; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; while(i < n) { diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c index f720d6ee5..07fe584c5 100644 --- a/kernel/arm/zcopy.c +++ b/kernel/arm/zcopy.c @@ -40,11 +40,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n < 0 ) return(0); - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; while(i < n) { diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 469487531..57f47e58e 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -35,25 +35,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ #include "common.h" -#include +#ifndef _MSC_VER +#include FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot[2]; - FLOAT _Complex result; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; dot[0]=0.0; dot[1]=0.0; - __real__ result = 0.0 ; - __imag__ result = 0.0 ; + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; if ( n < 1 ) return(result); - BLASLONG inc_x2 = 2 * inc_x ; - BLASLONG inc_y2 = 2 * inc_y ; + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; while(i < n) { @@ -69,8 +75,8 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in i++ ; } - __real__ result = dot[0]; - __imag__ result = dot[1]; + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; return(result); } diff --git a/kernel/arm/zrot.c b/kernel/arm/zrot.c index 356a4df72..98be68db8 100644 --- a/kernel/arm/zrot.c +++ b/kernel/arm/zrot.c @@ -41,11 +41,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n <= 0 ) return(0); - BLASLONG inc_x2 = 2 * inc_x ; - BLASLONG inc_y2 = 2 * inc_y ; + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; while(i < n) { diff --git a/kernel/arm/zswap.c b/kernel/arm/zswap.c index fcfb38506..ae4760ae0 100644 --- a/kernel/arm/zswap.c +++ b/kernel/arm/zswap.c @@ -42,11 +42,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; if ( n < 0 ) return(0); - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; while(i < n) { diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 new file mode 100644 index 000000000..6ee0c730c --- /dev/null +++ b/kernel/arm64/KERNEL.XGENE1 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV8 \ No newline at end of file diff --git a/kernel/generic/imatcopy_cn.c b/kernel/generic/imatcopy_cn.c new file mode 100644 index 000000000..e63bc976c --- /dev/null +++ b/kernel/generic/imatcopy_cn.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
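The kernel/arm hunks above hoist the inc_x2/inc_y2 declarations to the top of each function and swap the GCC-only __real__/__imag__ accessors for CREAL/CIMAG, so these portable C kernels also compile in strict C89 mode, where MSVC in particular rejects declarations placed after statements. Below is a minimal demo of the declaration ordering they now follow; zcopy_style_demo() is a made-up name, not one of the patched kernels.

/* Declarations first, assignments after the early return: valid C89 and C99. */
int zcopy_style_demo(long n, const double *x, long inc_x, double *y, long inc_y)
{
    long i = 0;
    long inc_x2, inc_y2;            /* declared before any statement */

    if (n < 0) return 0;

    inc_x2 = 2 * inc_x;             /* complex stride: two reals per element */
    inc_y2 = 2 * inc_y;
    while (i < n) {
        y[i * inc_y2]     = x[i * inc_x2];
        y[i * inc_y2 + 1] = x[i * inc_x2 + 1];
        i++;
    }
    return 0;
}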
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2015-09-07 grisuthedragon +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda) +{ + BLASLONG i,j; + FLOAT *aptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + if ( alpha == 1.0 ) return(0); + + aptr = a; + if ( alpha == 0.0 ) + { + for ( i=0; i + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j + +void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) +{ + int cpuInfo[4] = {-1}; + __cpuid(cpuInfo, op); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +} +#endif diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 3508753ee..2dcc8658b 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -119,11 +119,11 @@ XCOPYKERNEL = zcopy.S endif ifndef SDOTKERNEL -SDOTKERNEL = ../generic/dot.c +SDOTKERNEL = ../generic/dot.c endif ifndef DSDOTKERNEL -DSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = ../generic/dot.c endif ifndef DDOTKERNEL @@ -440,10 +440,18 @@ XGEMMITCOPYOBJ = XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifndef SGEMM_BETA SGEMM_BETA = gemm_beta.S +endif +ifndef DGEMM_BETA DGEMM_BETA = gemm_beta.S +endif +ifndef CGEMM_BETA CGEMM_BETA = zgemm_beta.S +endif +ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta.S +endif QGEMM_BETA = ../generic/gemm_beta.c XGEMM_BETA = ../generic/zgemm_beta.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 289529772..9f124c97f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,9 +1,14 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = 
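The cpuid_win.c addition above gives MSVC builds a cpuid(op, eax, ebx, ecx, edx) entry point by wrapping the compiler's __cpuid intrinsic, matching the interface the assembly cpuid.S provides elsewhere. As a usage illustration only (a hypothetical caller, not code from the patch): CPUID leaf 0 returns the vendor string in EBX, EDX, ECX, in that order.

#include <stdio.h>
#include <string.h>

void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx);  /* cpuid_win.c / cpuid.S */

int print_cpu_vendor(void)
{
    int eax, ebx, ecx, edx;
    char vendor[13];

    cpuid(0, &eax, &ebx, &ecx, &edx);
    memcpy(vendor + 0, &ebx, 4);              /* first 4 vendor bytes  */
    memcpy(vendor + 4, &edx, 4);              /* next 4 vendor bytes   */
    memcpy(vendor + 8, &ecx, 4);              /* last 4 vendor bytes   */
    vendor[12] = '\0';
    printf("CPU vendor: %s\n", vendor);       /* e.g. GenuineIntel / AuthenticAMD */
    return eax;                               /* highest standard CPUID leaf */
}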
sdot.c -DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c @@ -26,11 +31,11 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S - SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S @@ -40,6 +45,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c @@ -49,6 +55,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S ZGEMMINCOPY = ZGEMMITCOPY = diff --git a/kernel/x86_64/KERNEL.EXCAVATOR b/kernel/x86_64/KERNEL.EXCAVATOR new file mode 100644 index 000000000..dbdd1fe9b --- /dev/null +++ b/kernel/x86_64/KERNEL.EXCAVATOR @@ -0,0 +1,92 @@ +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + + +DSYMV_U_KERNEL = dsymv_U.c +DSYMV_L_KERNEL = dsymv_L.c +SSYMV_U_KERNEL = ssymv_U.c +SSYMV_L_KERNEL = ssymv_L.c + +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c + +DGEMVNKERNEL = dgemv_n_4.c +DGEMVTKERNEL = dgemv_t_4.c + +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_4.c + +DCOPYKERNEL = dcopy_bulldozer.S + + +SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = gemm_ncopy_2_bulldozer.S +SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = gemm_ncopy_2_bulldozer.S +DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMM3MKERNEL = 
zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index a621b4484..a4686debb 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,3 +1,7 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c @@ -10,6 +14,22 @@ ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t_4.c +SSYMV_L_KERNEL = ssymv_L.c +SSYMV_U_KERNEL = ssymv_U.c +DSYMV_L_KERNEL = dsymv_L.c +DSYMV_U_KERNEL = dsymv_U.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +STRMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c @@ -20,16 +40,18 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_4x4_haswell.S -DGEMMINCOPY = -DGEMMITCOPY = -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = +DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c +DGEMMKERNEL = dgemm_kernel_4x8_haswell.S +DGEMMINCOPY = ../generic/gemm_ncopy_4.c +DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CTRMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c @@ -40,6 +62,7 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 55285e3d3..5d3c7a2af 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,3 +1,13 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c @@ -7,7 +17,11 @@ ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = 
dgemv_t_bulldozer.S -DDOTKERNEL = ddot_bulldozer.S +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DCOPYKERNEL = dcopy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index ff96cd011..355d1e2f1 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,8 +1,28 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c + +SGERKERNEL = sger.c +DGERKERNEL = dger.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + +SSYMV_L_KERNEL = ssymv_L.c +SSYMV_U_KERNEL = ssymv_U.c +DSYMV_L_KERNEL = dsymv_L.c +DSYMV_U_KERNEL = dsymv_U.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index f5b5cb942..51e6d616a 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -1,9 +1,17 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c + DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index 2bcd83636..a23e59f3f 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -1,3 +1,8 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c @@ -50,3 +55,111 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
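The KERNEL.EXCAVATOR, KERNEL.HASWELL, KERNEL.PILEDRIVER, KERNEL.SANDYBRIDGE, KERNEL.STEAMROLLER and KERNEL.generic fragments above are the per-CPU kernel lists the build includes for the selected TARGET; each variable names the source file that implements one BLAS routine for that core. The added DOT, AXPY, SCAL, SYMV and GER entries point those routines at C implementations (several of them, such as cdot.c and cscal.c, are added later in this patch), while KERNEL.generic routes everything to portable C under ../arm and ../generic. Each of those C files then selects an architecture-specific micro-kernel at compile time from the TARGET define and keeps a portable fallback. A minimal sketch of that pattern, using the caxpy include names from this patch (abbreviated, not the full dispatch chain):

    #include "common.h"

    #if defined(HASWELL)
    #include "caxpy_microk_haswell-2.c"    /* sets HAVE_KERNEL_8 */
    #elif defined(SANDYBRIDGE)
    #include "caxpy_microk_sandy-2.c"      /* sets HAVE_KERNEL_8 */
    #endif

    #ifndef HAVE_KERNEL_8
    /* portable fallback when no micro-kernel matched the TARGET define;
       the real file also handles the CONJ variant */
    static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
    {
        BLASLONG i;
        for (i = 0; i < 2 * n; i += 2) {
            y[i]     += alpha[0] * x[i]     - alpha[1] * x[i + 1];
            y[i + 1] += alpha[0] * x[i + 1] + alpha[1] * x[i];
        }
    }
    #endif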
CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index be945a441..1ee0499a7 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -29,8 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
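In the caxpy.c hunk just below, the wrapper now feeds the micro-kernel blocks of 32 complex elements instead of 8: n1 = n & -32 rounds n down to a multiple of 32 (in two's complement, -32 is ...1110 0000, so the AND clears the five low bits), the micro-kernel handles those n1 elements, and the scalar while loop finishes the at most 31 that remain, starting from the added i = n1. The hunk also widens n1 from int to BLASLONG, avoiding truncation for very long vectors, and passes da (which decays to FLOAT *) instead of &da, matching the kernel prototype. A tiny self-contained illustration of the rounding:

    #include <stdio.h>

    int main(void)
    {
        long n  = 100;
        long n1 = n & -32;   /* 100 rounded down to a multiple of 32 */

        /* prints: bulk = 96, tail = 4 */
        printf("bulk = %ld, tail = %ld\n", n1, n - n1);
        return 0;
    }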
#include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(STEAMROLLER) +#include "caxpy_microk_steamroller-2.c" +#elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "caxpy_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "caxpy_microk_sandy-2.c" #endif @@ -78,15 +84,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -8; + BLASLONG n1 = n & -32; if ( n1 ) { da[0] = da_r; da[1] = da_i; - caxpy_kernel_8(n1, x, y , &da ); + caxpy_kernel_8(n1, x, y , da ); ix = 2 * n1; } + i = n1; while(i < n) { diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 63575c374..33bda0943 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -31,89 +31,87 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __att static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif BLASLONG register i = 0; + if ( n < 640 ) + { + __asm__ __volatile__ ( + "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif ".align 16 \n\t" "1: \n\t" - "prefetcht0 768(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x - "prefetcht0 768(%3,%0,4) \n\t" -#if !defined(CONJ) - "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t" + "vmovups 64(%2,%0,4), %%xmm12 \n\t" // 2 complex values from x + "vmovups 80(%2,%0,4), %%xmm13 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,4), %%xmm14 \n\t" // 2 complex values from x + "vmovups 112(%2,%0,4), %%xmm15 \n\t" // 2 complex values from x + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t" - - "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t" "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t" - - "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t" - - "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t" "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part - "vmulps %%xmm1, %%xmm10, %%xmm10 \n\t" - "vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t" - "vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t" - "vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t" - "vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t" + "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t" + ".align 2 \n\t" + "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t" -#else + "vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" - "vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i - 
"vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i - "vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i - "vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i - "vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i - "vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i - "vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i - "vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i + "vpermilps $0xb1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part - "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part - "vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t" - "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + "vfmaddps 64(%3,%0,4), %%xmm0 , %%xmm12, %%xmm12 \n\t" + "vfmaddps 80(%3,%0,4), %%xmm0 , %%xmm13, %%xmm13 \n\t" + "vfmaddps 96(%3,%0,4), %%xmm0 , %%xmm14, %%xmm14 \n\t" + "vfmaddps 112(%3,%0,4), %%xmm0 , %%xmm15, %%xmm15 \n\t" - "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part - "vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t" - "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + "vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmaddps %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmaddps %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmaddps %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t" - "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part - "vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t" - "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + "vmovups %%xmm12, 64(%3,%0,4) \n\t" + "vmovups %%xmm13, 80(%3,%0,4) \n\t" + "vmovups %%xmm14, 96(%3,%0,4) \n\t" + "vmovups %%xmm15,112(%3,%0,4) \n\t" - "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part - "vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t" - "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part - - "vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t" - "vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t" - "vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t" - "vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t" - - -#endif - - "vmovups %%xmm12, (%3,%0,4) \n\t" - "vmovups %%xmm13, 16(%3,%0,4) \n\t" - "vmovups %%xmm14, 32(%3,%0,4) \n\t" - "vmovups %%xmm15, 48(%3,%0,4) \n\t" - - "addq $16, %0 \n\t" - "subq $8 , %1 \n\t" + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" "jnz 1b \n\t" + "vzeroupper \n\t" : : @@ -121,7 +119,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 - "r" (alpha) // 4 + "r" (alpha), // 4 + "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", @@ -129,7 +128,73 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" 
+ "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x + + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "prefetcht0 512(%3,%0,4) \n\t" + "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t" + ".align 2 \n\t" + "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t" + + "vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + } - diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c new file mode 100644 index 000000000..7a9fc1b95 --- /dev/null +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
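The rewritten Bulldozer caxpy micro-kernel above, and the new Haswell, Sandy Bridge and Steamroller ones that follow, share one idea: instead of multiplying by the imaginary part of alpha and then fixing the signs with vaddsubps inside the loop (as the removed code did), they apply the sign pattern once, before the loop, by multiplying the broadcast imaginary part of alpha by the mvec constant {-1,+1,...} (or, under CONJ, the real part by {+1,-1,...}). After that, each block of complex values needs only two fused multiply-adds: one with alpha_r against x, and one with the pre-signed alpha_i against x with real and imaginary lanes swapped (the vpermilps $0xb1). A scalar model of one complex element, just to make the sign bookkeeping explicit (a sketch, not code from the patch):

    static void caxpy_elem(float a_r, float a_i, const float *x, float *y)
    {
    #if !defined(CONJ)
        const float ar[2] = {  a_r,  a_r };   /* broadcast real part          */
        const float ai[2] = { -a_i,  a_i };   /* imag part times mvec {-1,+1} */
    #else
        const float ar[2] = {  a_r, -a_r };   /* real part times mvec {+1,-1} */
        const float ai[2] = {  a_i,  a_i };   /* broadcast imag part          */
    #endif
        /* two "FMAs": y += ar*x and y += ai*swap(x) */
        y[0] += ar[0] * x[0] + ai[0] * x[1];
        y[1] += ar[1] * x[1] + ai[1] * x[0];
    }

Without CONJ this expands to y_r += a_r*x_r - a_i*x_i and y_i += a_r*x_i + a_i*x_r, the usual y += alpha*x update.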
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulps (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x + "vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x + "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x + + "vmovups 128(%2,%0,4), %%ymm12 \n\t" // 4 complex values from x + "vmovups 160(%2,%0,4), %%ymm13 \n\t" // 4 complex values from x + "vmovups 192(%2,%0,4), %%ymm14 \n\t" // 4 complex values from x + "vmovups 224(%2,%0,4), %%ymm15 \n\t" // 4 complex values from x + + "vpermilps $0xb1 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213ps (%3,%0,4), %%ymm0 , %%ymm5 \n\t" + ".align 2 \n\t" + "vfmadd213ps 32(%3,%0,4), %%ymm0 , %%ymm7 \n\t" + "vfmadd213ps 64(%3,%0,4), %%ymm0 , %%ymm9 \n\t" + "vfmadd213ps 96(%3,%0,4), %%ymm0 , %%ymm11 \n\t" + + "vfmadd231ps %%ymm1 , %%ymm4 , %%ymm5 \n\t" + "vfmadd231ps %%ymm1 , %%ymm6 , %%ymm7 \n\t" + "vfmadd231ps %%ymm1 , %%ymm8 , %%ymm9 \n\t" + "vfmadd231ps %%ymm1 , %%ymm10, %%ymm11 \n\t" + + "vpermilps $0xb1 , %%ymm12, %%ymm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm13, %%ymm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm14, %%ymm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm15, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213ps 128(%3,%0,4), %%ymm0 , %%ymm12 \n\t" + "vfmadd213ps 160(%3,%0,4), %%ymm0 , %%ymm13 \n\t" + "vfmadd213ps 192(%3,%0,4), %%ymm0 , %%ymm14 \n\t" + "vfmadd213ps 224(%3,%0,4), %%ymm0 , %%ymm15 \n\t" + + "vfmadd231ps %%ymm1 , %%ymm4 , %%ymm12 \n\t" + "vfmadd231ps %%ymm1 , %%ymm6 , %%ymm13 \n\t" + "vfmadd231ps %%ymm1 , %%ymm8 , %%ymm14 \n\t" + "vfmadd231ps %%ymm1 , %%ymm10, %%ymm15 \n\t" + + "vmovups %%ymm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,4) \n\t" + "vmovups %%ymm9 , 64(%3,%0,4) \n\t" + "vmovups %%ymm11, 96(%3,%0,4) \n\t" + + "vmovups %%ymm12,128(%3,%0,4) \n\t" + "vmovups %%ymm13,160(%3,%0,4) \n\t" + "vmovups %%ymm14,192(%3,%0,4) \n\t" + "vmovups %%ymm15,224(%3,%0,4) \n\t" + + "addq $64, %0 \n\t" + "subq $32, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c new file mode 100644 index 
000000000..dbfce208f --- /dev/null +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulps (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x + "vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x + "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x + + "vpermilps $0xb1 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vmulps %%ymm5 , %%ymm0 , %%ymm5 \n\t" + "vmulps %%ymm7 , %%ymm0 , %%ymm7 \n\t" + "vmulps %%ymm9 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm11, %%ymm0 , %%ymm11 \n\t" + + "vaddps (%3,%0,4), %%ymm5 , %%ymm5 \n\t" + "vaddps 32(%3,%0,4), %%ymm7 , %%ymm7 \n\t" + "vaddps 64(%3,%0,4), %%ymm9 , %%ymm9 \n\t" + "vaddps 96(%3,%0,4), %%ymm11, %%ymm11 \n\t" + + "vmulps %%ymm4 , %%ymm1 , %%ymm4 \n\t" + "vmulps %%ymm6 , %%ymm1 , %%ymm6 \n\t" + "vmulps %%ymm8 , %%ymm1 , %%ymm8 \n\t" + "vmulps %%ymm10, %%ymm1 
, %%ymm10 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm6 , %%ymm7 , %%ymm7 \n\t" + "vaddps %%ymm8 , %%ymm9 , %%ymm9 \n\t" + "vaddps %%ymm10, %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,4) \n\t" + "vmovups %%ymm9 , 64(%3,%0,4) \n\t" + "vmovups %%ymm11, 96(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c new file mode 100644 index 000000000..87370b032 --- /dev/null +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
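The Steamroller caxpy file that starts here follows the same two-path layout as the Bulldozer one above: a loop without software prefetch for short vectors (n <= 2048 here, n < 640 on Bulldozer) and a prefetcht0 variant for longer ones, presumably because prefetching only pays off once the operands stop fitting in cache; the cutoffs look hand-tuned per core. None of these micro-kernels is called directly; user code reaches them through the BLAS/CBLAS entry points. A small usage sketch, assuming the standard cblas.h prototype and an OpenBLAS build for one of these targets:

    #include <complex.h>
    #include <cblas.h>

    int main(void)
    {
        enum { N = 64 };                      /* >= 32, so the blocked path is used */
        float complex alpha = 2.0f + 1.0f * I;
        float complex x[N], y[N];

        for (int i = 0; i < N; i++) { x[i] = i + 1.0f * I; y[i] = 1.0f; }

        /* y := alpha*x + y; unit strides route this to caxpy_kernel_8 */
        cblas_caxpy(N, &alpha, x, 1, y, 1);
        return 0;
    }

Building is typically just cc test.c -lopenblas.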
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + if ( n <= 2048 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x + + "vmovups 64(%2,%0,4), %%xmm12 \n\t" // 2 complex values from x + "vmovups 80(%2,%0,4), %%xmm13 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,4), %%xmm14 \n\t" // 2 complex values from x + "vmovups 112(%2,%0,4), %%xmm15 \n\t" // 2 complex values from x + + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213ps (%3,%0,4), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213ps 16(%3,%0,4), %%xmm0 , %%xmm7 \n\t" + "vfmadd213ps 32(%3,%0,4), %%xmm0 , %%xmm9 \n\t" + "vfmadd213ps 48(%3,%0,4), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231ps %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vpermilps $0xb1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213ps 64(%3,%0,4), %%xmm0 , %%xmm12 \n\t" + "vfmadd213ps 80(%3,%0,4), %%xmm0 , %%xmm13 \n\t" + "vfmadd213ps 96(%3,%0,4), %%xmm0 , %%xmm14 \n\t" + "vfmadd213ps 112(%3,%0,4), %%xmm0 , %%xmm15 \n\t" + + "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmadd231ps %%xmm1 , %%xmm10, %%xmm15 \n\t" + + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + "vmovups %%xmm12, 64(%3,%0,4) \n\t" + "vmovups %%xmm13, 80(%3,%0,4) \n\t" + "vmovups %%xmm14, 96(%3,%0,4) \n\t" + "vmovups %%xmm15,112(%3,%0,4) \n\t" + + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" 
// imag part of alpha +#if !defined(CONJ) + "vmulps (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulps (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x + + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "prefetcht0 512(%3,%0,4) \n\t" + "vfmadd213ps (%3,%0,4), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213ps 16(%3,%0,4), %%xmm0 , %%xmm7 \n\t" + "vfmadd213ps 32(%3,%0,4), %%xmm0 , %%xmm9 \n\t" + "vfmadd213ps 48(%3,%0,4), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231ps %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vmovups %%xmm5 , (%3,%0,4) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,4) \n\t" + "vmovups %%xmm9 , 32(%3,%0,4) \n\t" + "vmovups %%xmm11, 48(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c new file mode 100644 index 000000000..2b2c4ff7a --- /dev/null +++ b/kernel/x86_64/cdot.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
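cdot.c, whose body follows, backs both CDOTU and CDOTC with a single kernel: it keeps four kinds of running sums, sum(x_r*y_r), sum(x_i*y_i), sum(x_r*y_i) and sum(x_i*y_r) (dot[0], dot[1], dot[4], dot[5], with dot[2], dot[3], dot[6], dot[7] as second partial sums the wrapper folds in afterwards), and only combines them at the end: real = dot[0] - dot[1] and imag = dot[4] + dot[5] for the plain product, real = dot[0] + dot[1] and imag = dot[4] - dot[5] when x is conjugated. A plain C reference of the same computation (a sketch, assuming single precision and unit stride):

    #include <complex.h>

    static float complex cdot_ref(long n, const float *x, const float *y, int conj_x)
    {
        float rr = 0.0f, ii = 0.0f, ri = 0.0f, ir = 0.0f;

        for (long i = 0; i < 2 * n; i += 2) {
            rr += x[i]     * y[i];        /* x_r * y_r */
            ii += x[i + 1] * y[i + 1];    /* x_i * y_i */
            ri += x[i]     * y[i + 1];    /* x_r * y_i */
            ir += x[i + 1] * y[i];        /* x_i * y_r */
        }
        return conj_x ? (rr + ii) + (ri - ir) * I    /* CDOTC: conj(x) . y */
                      : (rr - ii) + (ri + ir) * I;   /* CDOTU: x . y       */
    }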
+*****************************************************************************/ + + +#include "common.h" +#include + + +#if defined(BULLDOZER) +#include "cdot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#include "cdot_microk_steamroller-2.c" +#elif defined(HASWELL) +#include "cdot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "cdot_microk_sandy-2.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[8] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[4] += x[j] * y[j+1] ; + dot[5] += x[j+1] * y[j] ; + + dot[2] += x[j+2] * y[j+2] ; + dot[3] += x[j+3] * y[j+3] ; + dot[6] += x[j+2] * y[j+3] ; + dot[7] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[4] += x[j+4] * y[j+5] ; + dot[5] += x[j+5] * y[j+4] ; + + dot[2] += x[j+6] * y[j+6] ; + dot[3] += x[j+7] * y[j+7] ; + dot[6] += x[j+6] * y[j+7] ; + dot[7] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + d[4] = dot[4]; + d[5] = dot[5]; + d[6] = dot[6]; + d[7] = dot[7]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + cdot_kernel_16(n1, x, y , dot ); + dot[0] += dot[2]; + dot[1] += dot[3]; + dot[4] += dot[6]; + dot[5] += dot[7]; + } + + i = n1; + BLASLONG j = i * 2; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[4] += x[j] * y[j+1] ; + dot[5] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[4] += x[ix] * y[iy+1] ; + dot[5] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[4] + dot[5]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[4] - dot[5]; + +#endif + + return(result); + +} + + diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c new file mode 100644 index 000000000..f587aa036 --- /dev/null +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n <=1024 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + 
return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c new file mode 100644 index 000000000..fc76b138a --- /dev/null +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
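One detail of the Bulldozer cdot micro-kernel above that carries over to the Haswell, Sandy Bridge and Steamroller versions below: the loop spreads the products over eight independent vector accumulators (xmm0..xmm7, or ymm0..ymm7) and only adds them together after the jnz. The likely reason is latency hiding: with a single accumulator every fused multiply-add would wait for the previous one, while independent chains can overlap. The same idea in scalar C (an illustration, not code from the patch; the reassociation changes rounding slightly):

    /* one chain: each addition depends on the previous result */
    static float sum1(const float *a, long n)
    {
        float s = 0.0f;
        for (long i = 0; i < n; i++) s += a[i];
        return s;
    }

    /* four chains: independent additions can execute back to back */
    static float sum4(const float *a, long n)   /* n assumed a multiple of 4 */
    {
        float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
        for (long i = 0; i < n; i += 4) {
            s0 += a[i];
            s1 += a[i + 1];
            s2 += a[i + 2];
            s3 += a[i + 3];
        }
        return (s0 + s1) + (s2 + s3);
    }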
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%2,%0,4), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 2 * x + + "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y + + "vfmadd231ps %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + + "vfmadd231ps %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231ps %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $32 , %0 \n\t" + "vfmadd231ps %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $16 , %1 \n\t" + "vfmadd231ps %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c new file mode 100644 index 
000000000..22cd79e2e --- /dev/null +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y + + "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddps %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddps %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" + + "vmovups 64(%2,%0,4), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,4), %%ymm9 \n\t" // 2 * x + + "vaddps %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5 , %%ymm11, %%ymm5 \n\t" + + "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddps %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddps %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" + "addq $32 , %0 
\n\t" + "subq $16 , %1 \n\t" + "vaddps %%ymm6 , %%ymm10, %%ymm6 \n\t" + "vaddps %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c new file mode 100644 index 000000000..76a3aa0eb --- /dev/null +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 1280 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y + "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y + + "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + + "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y + "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + 
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part + "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" + + "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c new file mode 100644 index 000000000..5d86b1929 --- /dev/null +++ b/kernel/x86_64/cscal.c @@ -0,0 +1,452 @@ +/*************************************************************************** +Copyright (c) 2013 - 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(HASWELL) +#include "cscal_microk_haswell-2.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) +#include "cscal_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "cscal_microk_steamroller-2.c" +#elif defined(SANDYBRIDGE) +#include "cscal_microk_bulldozer-2.c" +#endif + + +#if !defined(HAVE_KERNEL_16) + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha , FLOAT *x ) +{ + + BLASLONG i; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + FLOAT t0,t1,t2,t3; + + for( i=0; i 0 ) + { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1 ; + i = n1 * inc_x; + } + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += inc_x ; + j++; + + } + + } + + } + + return(0); + } + + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + + alpha[0] = da_r; + alpha[1] = da_i; + + if ( da_r == 0.0 ) + if ( da_i == 0 ) + cscal_kernel_16_zero(n1 , alpha , x); + else + cscal_kernel_16_zero_r(n1 , alpha , x); + else + if ( da_i == 0 ) + cscal_kernel_16_zero_i(n1 , alpha , x); + else + cscal_kernel_16(n1 , alpha , x); + + i = n1 << 1; + j = n1; + } + + + if ( da_r == 0.0 ) + { + + if ( da_i == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + x[i+1]=0.0; + i += 2 ; + j++; + + } + + } + else + { + + while(j < n) + { + + temp0 = -da_i * x[i+1]; + x[i+1] = da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + else + { + + if ( da_i == 0.0 ) + { + + while(j < n) + { + + temp0 = da_r * x[i]; + x[i+1] = da_r * x[i+1]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + else + { + + BLASLONG n2 = n & -2; + + while(j < n2) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + temp1 = da_r * x[i+2] - da_i * x[i+3]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i+3] = da_r * x[i+3] + da_i * x[i+2]; + x[i] = temp0; + x[i+2] = temp1; + i += 4 ; + j+=2; + + } + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + + return(0); +} + + diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c new file mode 100644 index 000000000..f470cf843 --- /dev/null +++ b/kernel/x86_64/cscal_microk_bulldozer-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm0 \n\t" // da_r + "vbroadcastss 4(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 320(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups -64(%1), %%xmm4 \n\t" + "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm10, %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm10, %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + 
"vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vbroadcastss 4(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups -64(%1), %%xmm4 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm0 , %%xmm9 \n\t" + "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm13 , %%xmm0 , %%xmm9 \n\t" + "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups -64(%1), %%xmm4 \n\t" + "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + 
"vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%xmm0 , -128(%1) \n\t" + "vmovups %%xmm0 , -112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c new file mode 100644 index 000000000..0424de3a5 --- /dev/null +++ b/kernel/x86_64/cscal_microk_haswell-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void 
cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, 
%%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c new file mode 100644 index 000000000..763e7add4 --- /dev/null +++ b/kernel/x86_64/cscal_microk_steamroller-2.c @@ -0,0 +1,349 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
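
The Steamroller kernel below is essentially the Haswell one with prefetcht0 hints added; both (and the 128-bit Bulldozer variant) scale interleaved complex data with the same vpermilps / vmulps / vaddsubps idiom: multiply by the broadcast real part, multiply the pair-swapped vector by the broadcast imaginary part, and let vaddsubps subtract in the even (real) lanes and add in the odd (imaginary) lanes. A minimal AVX intrinsics sketch of one 8-float step (cscal_step_avx is a hypothetical helper, not part of the patch; assumes immintrin.h and -mavx):

    #include <immintrin.h>

    /* one 8-float (4 complex) slice of x := (da_r + i*da_i) * x */
    static inline void cscal_step_avx(float *x, float da_r, float da_i)
    {
        __m256 vr = _mm256_set1_ps(da_r);        /* da_r in every lane          */
        __m256 vi = _mm256_set1_ps(da_i);        /* da_i in every lane          */
        __m256 v  = _mm256_loadu_ps(x);          /* r0,i0,r1,i1,...             */
        __m256 sw = _mm256_permute_ps(v, 0xb1);  /* i0,r0,i1,r1,... (pair swap) */
        __m256 p  = _mm256_mul_ps(vr, v);        /* da_r*r , da_r*i , ...       */
        __m256 q  = _mm256_mul_ps(vi, sw);       /* da_i*i , da_i*r , ...       */
        /* even lanes: p-q = da_r*r - da_i*i, odd lanes: p+q = da_r*i + da_i*r  */
        _mm256_storeu_ps(x, _mm256_addsub_ps(p, q));
    }

The assembly versions unroll this over four registers per iteration and issue the pair swaps for the next block between the stores to hide latency.
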
+*****************************************************************************/ + + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "prefetcht0 768(%1) \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 
\n\t" + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $16, %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT 
*alpha, FLOAT *x) __attribute__ ((noinline)); + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $16, %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index fd5343eba..56d323cbe 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -31,8 +31,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "daxpy_microk_nehalem-2.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) #include "daxpy_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "daxpy_microk_steamroller-2.c" +#elif defined(PILEDRIVER) +#include "daxpy_microk_piledriver-2.c" +#elif defined(HASWELL) +#include "daxpy_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "daxpy_microk_sandy-2.c" #endif @@ -71,7 +79,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -8; + BLASLONG n1 = n & -16; if ( n1 ) daxpy_kernel_8(n1, x, y , &da ); @@ -89,6 +97,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + while(i < n) { diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c new file mode 100644 index 000000000..db117a8ba --- /dev/null +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
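
The Haswell daxpy microkernel below uses FMA3 (vfmadd231pd) to fuse the multiply and add, consuming 16 doubles per loop iteration, which is why daxpy.c now truncates the fast path with n & -16. A minimal intrinsics sketch of one 4-double slice of the same update (daxpy_step_fma is a hypothetical helper, not part of the patch; assumes immintrin.h and -mavx -mfma):

    #include <immintrin.h>

    /* one 4-double slice of y += alpha * x, i.e. what a single
       vfmadd231pd performs on a %%ymm register */
    static inline void daxpy_step_fma(const double *x, double *y, double alpha)
    {
        __m256d va = _mm256_set1_pd(alpha);
        __m256d vy = _mm256_loadu_pd(y);
        vy = _mm256_fmadd_pd(_mm256_loadu_pd(x), va, vy);   /* x*alpha + y */
        _mm256_storeu_pd(y, vy);
    }

The Piledriver and Steamroller variants use the same FMA instruction on 128-bit registers and enable prefetcht0 only on their large-n paths.
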
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastsd (%4), %%ymm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm12 \n\t" // 4 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 4 * y + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 4 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 4 * y + "vfmadd231pd (%2,%0,8), %%ymm0 , %%ymm12 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%ymm0 , %%ymm13 \n\t" // y += alpha * x + "vfmadd231pd 64(%2,%0,8), %%ymm0 , %%ymm14 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%ymm0 , %%ymm15 \n\t" // y += alpha * x + "vmovups %%ymm12, (%3,%0,8) \n\t" + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + "vmovups %%ymm14, 64(%3,%0,8) \n\t" + "vmovups %%ymm15, 96(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c new file mode 100644 index 000000000..95eb953b4 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_piledriver-2.c @@ -0,0 +1,160 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + if ( n < 640 ) + { + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "prefetcht0 576(%3,%0,8) \n\t" + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "prefetcht0 512(%2,%0,8) \n\t" + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + + "prefetcht0 576(%2,%0,8) \n\t" + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * 
x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c new file mode 100644 index 000000000..522e084dc --- /dev/null +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
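
Sandy Bridge has AVX but no FMA, so the variant below replaces vfmadd231pd with separate vmulpd/vaddpd and software-pipelines the loop: the next 16 elements are loaded before the current results are written back at negative offsets. All of the daxpy microkernels implement the same contract, sketched here in scalar C (daxpy_kernel_ref is a hypothetical name, not part of the patch; FLOAT and BLASLONG are assumed from common.h):

    /* contiguous x and y; the caller passes n already truncated to a
       multiple of 16 and alpha by pointer, as the assembly kernels expect */
    static void daxpy_kernel_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
    {
        BLASLONG i;
        FLOAT da = *alpha;
        for (i = 0; i < n; i++)
            y[i] += da * x[i];
    }
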
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastsd (%4), %%ymm0 \n\t" // alpha + "vmovups (%3,%0,8), %%ymm8 \n\t" + "vmovups 32(%3,%0,8), %%ymm9 \n\t" + "vmovups 64(%3,%0,8), %%ymm10 \n\t" + "vmovups 96(%3,%0,8), %%ymm11 \n\t" + "vmovups (%2,%0,8), %%ymm4 \n\t" + "vmovups 32(%2,%0,8), %%ymm5 \n\t" + "vmovups 64(%2,%0,8), %%ymm6 \n\t" + "vmovups 96(%2,%0,8), %%ymm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmulpd %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddpd %%ymm8 , %%ymm4, %%ymm12 \n\t" + "vmulpd %%ymm5, %%ymm0, %%ymm5 \n\t" + "vaddpd %%ymm9 , %%ymm5, %%ymm13 \n\t" + "vmulpd %%ymm6, %%ymm0, %%ymm6 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm14 \n\t" + "vmulpd %%ymm7, %%ymm0, %%ymm7 \n\t" + "vaddpd %%ymm11, %%ymm7, %%ymm15 \n\t" + + "vmovups (%3,%0,8), %%ymm8 \n\t" + "vmovups 32(%3,%0,8), %%ymm9 \n\t" + "vmovups 64(%3,%0,8), %%ymm10 \n\t" + "vmovups 96(%3,%0,8), %%ymm11 \n\t" + + "vmovups (%2,%0,8), %%ymm4 \n\t" + "vmovups 32(%2,%0,8), %%ymm5 \n\t" + "vmovups 64(%2,%0,8), %%ymm6 \n\t" + "vmovups 96(%2,%0,8), %%ymm7 \n\t" + + "vmovups %%ymm12, -128(%3,%0,8) \n\t" + "vmovups %%ymm13, -96(%3,%0,8) \n\t" + "vmovups %%ymm14, -64(%3,%0,8) \n\t" + "vmovups %%ymm15, -32(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + "vmulpd %%ymm4, %%ymm0, %%ymm4 \n\t" + "vmulpd %%ymm5, %%ymm0, %%ymm5 \n\t" + "vmulpd %%ymm6, %%ymm0, %%ymm6 \n\t" + "vmulpd %%ymm7, %%ymm0, %%ymm7 \n\t" + + "vaddpd %%ymm8 , %%ymm4, %%ymm12 \n\t" + "vaddpd %%ymm9 , %%ymm5, %%ymm13 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm14 \n\t" + "vaddpd %%ymm11, %%ymm7, %%ymm15 \n\t" + + "vmovups %%ymm12, -128(%3,%0,8) \n\t" + "vmovups %%ymm13, -96(%3,%0,8) \n\t" + "vmovups %%ymm14, -64(%3,%0,8) \n\t" + "vmovups %%ymm15, -32(%3,%0,8) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c new file mode 100644 index 000000000..e40009037 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_steamroller-2.c @@ -0,0 +1,160 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + if ( n < 2048 ) + { + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x + + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y + "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y + "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y + "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y + + "prefetcht0 576(%3,%0,8) \n\t" + "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y + "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y + "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y + "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y + + "prefetcht0 512(%2,%0,8) \n\t" + "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x + "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x + "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x + "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += 
alpha * x + + "prefetcht0 576(%2,%0,8) \n\t" + "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x + "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x + "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x + "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x + + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index d501c2f68..4bf8082c9 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,10 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" -#elif defined(NEHALEM) +#elif defined(STEAMROLLER) +#include "ddot_microk_steamroller-2.c" +#elif defined(PILEDRIVER) +#include "ddot_microk_piledriver-2.c" +#elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "ddot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "ddot_microk_sandy-2.c" #endif @@ -75,12 +83,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -8; + BLASLONG n1 = n & -16; if ( n1 ) ddot_kernel_8(n1, x, y , &dot ); - i = n1; while(i < n) { @@ -94,15 +101,40 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + while(i < n) { - dot += y[iy] * x[ix] ; + temp1 += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } + dot = temp1 + temp2; return(dot); } diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c new file mode 100644 index 000000000..d36577af3 --- /dev/null +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
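
Like daxpy, ddot.c now selects a per-CPU microkernel and truncates the contiguous fast path with n & -16, while the strided fallback accumulates into two separate temporaries (temp1/temp2). Each microkernel keeps four vector partial sums and reduces them to a scalar at the end (the vextractf128/vaddpd/vhaddpd tail in the Haswell version below). The kernel contract, sketched in scalar C (ddot_kernel_ref is a hypothetical name, not part of the patch; FLOAT and BLASLONG are assumed from common.h):

    /* contiguous x and y; n is a multiple of 16; the result overwrites *dot */
    static void ddot_kernel_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
    {
        BLASLONG i;
        FLOAT sum = 0.0;
        for (i = 0; i < n; i++)
            sum += x[i] * y[i];
        *dot = sum;
    }
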
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x + "vmovups 64(%2,%0,8), %%ymm14 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm15 \n\t" // 2 * x + + "vfmadd231pd (%3,%0,8), %%ymm12, %%ymm4 \n\t" // 2 * y + "vfmadd231pd 32(%3,%0,8), %%ymm13, %%ymm5 \n\t" // 2 * y + "vfmadd231pd 64(%3,%0,8), %%ymm14, %%ymm6 \n\t" // 2 * y + "vfmadd231pd 96(%3,%0,8), %%ymm15, %%ymm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + "jnz 1b \n\t" + + "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" + + "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c new file mode 100644 index 000000000..ac950885c --- /dev/null +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -0,0 +1,165 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 1408 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + + "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y + "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x + "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x + "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y + "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x + "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x + "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y + "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y + + + "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y + "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y + "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y + "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 768(%2,%0,8) \n\t" + "prefetcht0 832(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + + "prefetcht0 768(%3,%0,8) \n\t" + "prefetcht0 832(%3,%0,8) \n\t" + "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y + "vmovups 64(%2,%0,8), %%xmm0 
\n\t" // 2 * x + "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x + "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y + "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x + "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x + "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y + "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y + + + "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y + "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y + "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y + "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c new file mode 100644 index 000000000..e2e6701c7 --- /dev/null +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x + "vmovups 64(%2,%0,8), %%ymm14 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm15 \n\t" // 2 * x + + "vmulpd (%3,%0,8), %%ymm12, %%ymm12 \n\t" // 2 * y + "vmulpd 32(%3,%0,8), %%ymm13, %%ymm13 \n\t" // 2 * y + "vmulpd 64(%3,%0,8), %%ymm14, %%ymm14 \n\t" // 2 * y + "vmulpd 96(%3,%0,8), %%ymm15, %%ymm15 \n\t" // 2 * y + + "vaddpd %%ymm4 , %%ymm12, %%ymm4 \n\t" // 2 * y + "vaddpd %%ymm5 , %%ymm13, %%ymm5 \n\t" // 2 * y + "vaddpd %%ymm6 , %%ymm14, %%ymm6 \n\t" // 2 * y + "vaddpd %%ymm7 , %%ymm15, %%ymm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + "jnz 1b \n\t" + + "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" + + "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c new file mode 100644 index 000000000..5ce20b5de --- /dev/null +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + + "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y + "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x + "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x + "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y + "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x + "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x + "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y + "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y + + + "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y + "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y + "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y + "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y + + "addq $16 , %0 \n\t" + "subq $16 , %1 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S new file mode 100644 index 000000000..c84b599ce --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -0,0 +1,4753 @@ +/********************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , 
%ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 
, %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $ 0xb1 , %ymm13, %ymm13 + vpermpd $ 0xb1 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), 
%xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd 
%xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ + + +.macro INIT4x8 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + +.endm + +.macro KERNEL4x8_I + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vmulpd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -4 * SIZE(BO), %ymm1 + 
vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 0 * SIZE(BO), %ymm2 + addq $ 16*SIZE, BO +.endm + + +.macro KERNEL4x8_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + addq $ 8*SIZE, BO +.endm + +.macro KERNEL4x8_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 8*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + +.endm + + +.macro SAVE4x8 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + 
vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL2x8_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 2*SIZE, AO + +.endm + +.macro SAVE2x8 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL1x8_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 1*SIZE, AO + +.endm + +.macro SAVE1x8 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , 
%xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + + +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + 
vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ 
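The INITMxN / KERNELMxN_SUB / SAVEMxN macro triplets defined above and below all implement the same register-tile update: clear a block of accumulators, fold one k step of the packed A panel against one k step of the packed B panel per call, then scale by ALPHA and (in the non-TRMM case) add the tile into C. For orientation only, a rough scalar C sketch of what the 2x2 triplet computes follows; the helper name and the packed-buffer layout spelled out here are illustrative assumptions rather than definitions from this file, and FLOAT / BLASLONG are the common.h types used by the C kernels earlier in this patch:

    static void ref_kernel_2x2(BLASLONG k, FLOAT alpha,
                               FLOAT *a,               /* packed A: 2 rows per k step    */
                               FLOAT *b,               /* packed B: 2 columns per k step */
                               FLOAT *c, BLASLONG ldc) /* column-major 2x2 tile of C     */
    {
        FLOAT c00 = 0.0, c10 = 0.0, c01 = 0.0, c11 = 0.0;   /* INIT2x2                  */

        for (BLASLONG l = 0; l < k; l++) {                   /* KERNEL2x2_SUB, k times   */
            c00 += a[2*l]     * b[2*l];
            c10 += a[2*l + 1] * b[2*l];
            c01 += a[2*l]     * b[2*l + 1];
            c11 += a[2*l + 1] * b[2*l + 1];
        }

        c[0]       += alpha * c00;                           /* SAVE2x2, non-TRMM path   */
        c[1]       += alpha * c10;
        c[ldc]     += alpha * c01;
        c[1 + ldc] += alpha * c11;
    }

The larger tiles (4x12, 4x8, 4x4) are the same computation on more accumulator registers; there the KERNEL4xN_* macros rotate the A vector in-register with vpermpd so one load reaches every row/column pairing, and the vpermpd/vblendpd sequences in SAVE4xN put the lanes back into memory order before the add to C.
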
+/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, 
%xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $24, %rdi + divq %rdi // N / 24 + movq %rax, Ndiv12 // N / 24 + movq %rdx, Nmod12 // N % 24 + + + movq Ndiv12, J + cmpq $ 0, J + je .L8_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values from BO1 + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + movq BO2 , B + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm2 + vmovups 0 * SIZE(BO2), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + 
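	// Note on the unrolled blocks above and below: each KERNEL4x12_* invocation
	// processes one k step (4 doubles of A against 12 doubles of packed B).
	// KERNEL4x12_I starts the pipeline by filling the accumulators with vmulpd,
	// the M1/M2 pair alternates A offsets so that AO and BO are advanced only
	// every second step, and KERNEL4x12_E drains the final step without
	// preloading B for a following iteration. Each group of eight calls matches
	// the "sarq $3" (K / 8) loop count; the K & 7 remainder is handled with
	// KERNEL4x12_SUB in the .L12_17 loop.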
+ jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + + +/**************************************************************************************************/ + +.L13_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values + movq B, BO2 + leaq (B,%rax, SIZE), BO3 // next offset to BO2 + leaq (BO3,%rax, SIZE), B // next offset to B + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + + +.L13_02b: + + vmovups 4 * SIZE(BO2), %ymm1 + vmovups 0 * SIZE(BO3), %ymm2 + vmovups 4 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO2 + addq $ 8*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L13_02b + + + +.L13_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L13_20 + + ALIGN_4 + +.L13_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L13_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L13_12a + + ALIGN_5 +.L13_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L13_12 + +.L13_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_13: + + test $1, %rax + jz .L13_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + 
KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_14: + + INIT4x12 + + +.L13_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_19 + + ALIGN_4 + +.L13_17: + + KERNEL4x12_SUB + + dec %rax + jne .L13_17 + ALIGN_4 + + +.L13_19: + + SAVE4x12 + + decq I # i -- + jne .L13_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L13_20: + // Test rest of M + + testq $3, M + jz .L13_100 // to next 16 lines of N + + +.L13_30: + testq $2, M + jz .L13_40 + + ALIGN_4 + +.L13_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L13_36 + ALIGN_4 + +.L13_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L13_32 + ALIGN_4 + +.L13_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_39 + + ALIGN_4 + +.L13_37: + + KERNEL2x12_SUB + + dec %rax + jne .L13_37 + ALIGN_4 + + +.L13_39: + + SAVE2x12 + + ALIGN_4 + +.L13_40: + testq $1, M + jz .L13_100 // to next 3 lines of N + + ALIGN_4 + +.L13_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L13_46 + + ALIGN_4 + +.L13_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L13_42 + ALIGN_4 + +.L13_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_49 + + ALIGN_4 + +.L13_47: + + KERNEL1x12_SUB + + dec %rax + jne .L13_47 + ALIGN_4 + + +.L13_49: + + SAVE1x12 + + ALIGN_4 + +.L13_100: + + decq J // j -- + jg .L12_01 + + + + +/**************************************************************************************************/ + +.L8_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $3, J // j = j / 8 + je .L4_0 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + movq B, BO 
// first buffer to BO + addq $12 * SIZE, BO + + INIT2x8 + + movq K, %rax + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x8 + + movq K, %rax + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + + ALIGN_4 + +.L8_100: + + movq K, %rax + salq $3, %rax // * 8 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L8_10 + + + +/**************************************************************************************************/ + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + testq $4, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + 
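For orientation, every K loop in this kernel has the same shape: sarq $3 peels off K/8 blocks of eight unrolled KERNEL*_SUB updates, and andq $7 runs the remaining K%8 updates one at a time. A minimal C sketch of that control flow, with kernel_sub() as a hypothetical stand-in for the KERNEL*_SUB macros (illustration only, not code from this file):

/* K = 8*(K>>3) + (K&7): unrolled body first, scalar tail second. */
static void k_loop_shape(long K, void (*kernel_sub)(void))
{
    for (long i = K >> 3; i > 0; i--) {      /* sarq $3, %rax */
        kernel_sub(); kernel_sub(); kernel_sub(); kernel_sub();
        kernel_sub(); kernel_sub(); kernel_sub(); kernel_sub();
    }
    for (long i = K & 7; i > 0; i--)         /* andq $7, %rax */
        kernel_sub();
}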
+.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M 
+***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $8, %rdi + divq %rdi // N / 8 + movq %rax, Ndiv12 // N / 8 + movq %rdx, Nmod12 // N % 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +/*************************************************************************************************/ +.L8_0: + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L8_10: + movq C, 
CO1 + leaq (C, LDC, 8), C // c += 8 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x8 + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + 
KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x8 + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L8_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK // number of values in B +#endif + + + decq J // j -- + jg .L8_10 + + + + + +/*************************************************************************************************/ +.L4_0: + movq Nmod12, J + testq $4, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 
+ cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number 
of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M 
+***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + 
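In the TRMM path above, KK is the running diagonal offset and KKK is the number of K iterations actually executed for the current register tile. A hedged C sketch of how KKK is derived, mirroring the LEFT/TRANSA preprocessor branches in this file (tile_m and tile_n stand for the current block sizes; the function name is illustrative):

/* Effective inner-product length for one TRMM tile. */
static long trmm_effective_k(long K, long KK, long tile_m, long tile_n,
                             int left, int transa)
{
    if ((left && !transa) || (!left && transa))
        return K - KK;                        /* movq K, %rax ; subq KK, %rax */
    return KK + (left ? tile_m : tile_n);     /* movq KK, %rax ; addq $tile, %rax */
}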
+/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // 
number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 27df12bef..62016fc0b 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -37,48 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
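The dgemv_n kernels touched below all compute the same update, y += alpha * A * x, one register block of columns at a time. As a reference point, a plain-C sketch of the 4-column step (it mirrors the reference kernels in this file but is only an illustration, not the file's actual code):

/* One 4-column block of dgemv_n: ap[0..3] point to four columns of A. */
static void dgemv_ref_4x4(long n, double **ap, const double *x,
                          double *y, double alpha)
{
    for (long i = 0; i < n; i++)
        y[i] += alpha * (ap[0][i] * x[0] + ap[1][i] * x[1]
                       + ap[2][i] * x[2] + ap[3][i] * x[3]);
}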
#define NBMAX 2048 - -#ifndef HAVE_KERNEL_4x8 - -static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT *b0,*b1,*b2,*b3; - FLOAT *x4; - FLOAT x[8]; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x4 = x + 4; - - for ( i=0; i<8; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - - y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; - y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; - y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; - y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; - - } -} - -#endif - - #ifndef HAVE_KERNEL_4x4 static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) @@ -257,7 +215,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - BLASLONG lda8 = lda << 3; FLOAT xbuffer[8],*ybuffer; if ( m < 1 ) return(0); @@ -265,23 +222,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - if ( inc_x == 1 ) - { - n1 = n >> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; + n1 = n >> 2 ; + n2 = n & 3 ; - } - m3 = m & 3 ; m1 = m & -4 ; m2 = (m & (NBMAX-1)) - m3 ; - y_ptr = y; BLASLONG NB = NBMAX; @@ -314,22 +261,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) { dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); ap[0] += lda4; ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; a_ptr += lda4; x_ptr += 4; } diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index e1587b57c..b9f64407a 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -27,128 +27,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
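The Haswell hunks below drop the 4x8 path and rework dgemv_kernel_4x4 so that the column loads for the next iteration overlap the FMA work. For readability, an AVX2/FMA intrinsics sketch of the per-iteration update on four rows of y (an approximation for illustration; the patch itself uses inline assembly):

#include <immintrin.h>

/* Four rows of y: two mul/fma accumulator chains, then y += alpha * sum. */
static inline void dgemv_4rows_fma(const double *a0, const double *a1,
                                   const double *a2, const double *a3,
                                   const double *x, double alpha, double *y)
{
    __m256d x0 = _mm256_broadcast_sd(&x[0]);
    __m256d x1 = _mm256_broadcast_sd(&x[1]);
    __m256d x2 = _mm256_broadcast_sd(&x[2]);
    __m256d x3 = _mm256_broadcast_sd(&x[3]);

    __m256d s0 = _mm256_mul_pd(_mm256_loadu_pd(a0), x0);
    __m256d s1 = _mm256_mul_pd(_mm256_loadu_pd(a1), x1);
    s0 = _mm256_fmadd_pd(_mm256_loadu_pd(a2), x2, s0);
    s1 = _mm256_fmadd_pd(_mm256_loadu_pd(a3), x3, s1);

    __m256d yv = _mm256_loadu_pd(y);
    yv = _mm256_fmadd_pd(_mm256_set1_pd(alpha), _mm256_add_pd(s0, s1), yv);
    _mm256_storeu_pd(y, yv);
}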
-#define HAVE_KERNEL_4x8 1 -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y - - "addq $4 , %8 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" - "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "addq $8 , %8 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -159,68 +37,59 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT __asm__ __volatile__ ( - "vzeroupper \n\t" 
"vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vmovups (%4,%0,8), %%ymm0 \n\t" + "vmovups (%5,%0,8), %%ymm1 \n\t" + "vmovups (%6,%0,8), %%ymm2 \n\t" + "vmovups (%7,%0,8), %%ymm3 \n\t" "vbroadcastsd (%8), %%ymm6 \n\t" // alpha - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" + "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" + "vmovups (%4,%0,8), %%ymm0 \n\t" + "vmovups (%5,%0,8), %%ymm1 \n\t" + "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" + "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" + "vmovups (%6,%0,8), %%ymm2 \n\t" + "vmovups (%7,%0,8), %%ymm3 \n\t" + "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y - "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" "jnz 1b \n\t" + + + "2: \n\t" + + "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" + "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" + "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" + "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" + + + "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + + "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y + - "3: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c index 0d2c24d52..d8c29831a 100644 --- a/kernel/x86_64/dgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -27,150 +27,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#define HAVE_KERNEL_4x8 1 -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "movsd (%2), %%xmm12 \n\t" // x0 - "movsd 8(%2), %%xmm13 \n\t" // x1 - "movsd 16(%2), %%xmm14 \n\t" // x2 - "movsd 24(%2), %%xmm15 \n\t" // x3 - "shufpd $0, %%xmm12, %%xmm12\n\t" - "shufpd $0, %%xmm13, %%xmm13\n\t" - "shufpd $0, %%xmm14, %%xmm14\n\t" - "shufpd $0, %%xmm15, %%xmm15\n\t" - - "movsd 32(%2), %%xmm0 \n\t" // x4 - "movsd 40(%2), %%xmm1 \n\t" // x5 - "movsd 48(%2), %%xmm2 \n\t" // x6 - "movsd 56(%2), %%xmm3 \n\t" // x7 - "shufpd $0, %%xmm0 , %%xmm0 \n\t" - "shufpd $0, %%xmm1 , %%xmm1 \n\t" - "shufpd $0, %%xmm2 , %%xmm2 \n\t" - "shufpd $0, %%xmm3 , %%xmm3 \n\t" - - "movsd (%9), %%xmm6 \n\t" // alpha - "shufpd $0, %%xmm6 , %%xmm6 \n\t" - - - ".align 16 \n\t" - "1: \n\t" - "xorpd %%xmm4 , %%xmm4 \n\t" - "xorpd %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y - - ".align 2 \n\t" - "movups (%4,%0,8), %%xmm8 \n\t" - "movups (%5,%0,8), %%xmm9 \n\t" - "movups (%6,%0,8), %%xmm10 \n\t" - "movups (%7,%0,8), %%xmm11 \n\t" - ".align 2 \n\t" - "mulpd %%xmm12, %%xmm8 \n\t" - "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" - - "movups (%4,%8,8), %%xmm8 \n\t" - "movups (%5,%8,8), %%xmm9 \n\t" - "movups (%6,%8,8), %%xmm10 \n\t" - "movups (%7,%8,8), %%xmm11 \n\t" - ".align 2 \n\t" - "mulpd %%xmm0 , %%xmm8 \n\t" - "mulpd %%xmm1 , %%xmm9 \n\t" - "mulpd %%xmm2 , %%xmm10 \n\t" - "mulpd %%xmm3 , %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" - - "addpd %%xmm5 , %%xmm4 \n\t" - "mulpd %%xmm6 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm7 \n\t" - - "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y - - "xorpd %%xmm4 , %%xmm4 \n\t" - "xorpd %%xmm5 , %%xmm5 \n\t" - "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y - - ".align 2 \n\t" - "movups 16(%4,%0,8), %%xmm8 \n\t" - "movups 16(%5,%0,8), %%xmm9 \n\t" - "movups 16(%6,%0,8), %%xmm10 \n\t" - "movups 16(%7,%0,8), %%xmm11 \n\t" - ".align 2 \n\t" - "mulpd %%xmm12, %%xmm8 \n\t" - "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" - - "movups 16(%4,%8,8), %%xmm8 \n\t" - "movups 16(%5,%8,8), %%xmm9 \n\t" - "movups 16(%6,%8,8), %%xmm10 \n\t" - "movups 16(%7,%8,8), %%xmm11 \n\t" - ".align 2 \n\t" - "mulpd %%xmm0 , %%xmm8 \n\t" - "mulpd %%xmm1 , %%xmm9 \n\t" - "mulpd %%xmm2 , %%xmm10 \n\t" - "mulpd %%xmm3 , %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm5 \n\t" - "addpd %%xmm10, %%xmm4 \n\t" - "addpd %%xmm11, %%xmm5 \n\t" - - "addq $4 , %8 \n\t" - "addpd %%xmm5 , %%xmm4 \n\t" - "mulpd %%xmm6 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm7 \n\t" - - "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - "jnz 1b \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", 
"%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -193,54 +49,105 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "movsd (%8), %%xmm6 \n\t" // alpha "shufpd $0, %%xmm6 , %%xmm6 \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" + "movups 16(%4,%0,8), %%xmm0 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups 16(%5,%0,8), %%xmm1 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups 16(%6,%0,8), %%xmm2 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + "movups 16(%7,%0,8), %%xmm3 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jz 2f \n\t" + ".align 16 \n\t" "1: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + "movups -32(%3,%0,8), %%xmm7 \n\t" // 2 * y + + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm12, %%xmm0 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm0 , %%xmm5 \n\t" "movups (%4,%0,8), %%xmm8 \n\t" + "movups 16(%4,%0,8), %%xmm0 \n\t" + + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm13, %%xmm1 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm1 , %%xmm5 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups 16(%5,%0,8), %%xmm1 \n\t" + + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm14, %%xmm2 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm2 , %%xmm5 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups 16(%6,%0,8), %%xmm2 \n\t" + + "mulpd %%xmm15, %%xmm11 \n\t" + "mulpd %%xmm15, %%xmm3 \n\t" + "addpd %%xmm11 , %%xmm4 \n\t" + "addpd %%xmm3 , %%xmm5 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" - "mulpd %%xmm12, %%xmm8 \n\t" - "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm4 \n\t" - "addpd %%xmm10 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm11 \n\t" + "movups 16(%7,%0,8), %%xmm3 \n\t" - "mulpd %%xmm6 , %%xmm11 \n\t" - "addpd %%xmm7 , %%xmm11 \n\t" - "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y - "xorpd %%xmm4 , %%xmm4 \n\t" - "xorpd %%xmm5 , %%xmm5 \n\t" - "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm7 , %%xmm4 \n\t" + "movups -16(%3,%0,8), %%xmm7 \n\t" // 2 * y + "movups %%xmm4 , -32(%3,%0,8) \n\t" // 2 * y - "movups 16(%4,%0,8), %%xmm8 \n\t" - "movups 16(%5,%0,8), %%xmm9 \n\t" - "movups 16(%6,%0,8), %%xmm10 \n\t" - "movups 16(%7,%0,8), %%xmm11 \n\t" - "mulpd %%xmm12, %%xmm8 \n\t" - "mulpd %%xmm13, %%xmm9 \n\t" - "mulpd %%xmm14, %%xmm10 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" - "addpd %%xmm8 , %%xmm4 \n\t" - "addpd %%xmm9 , %%xmm4 \n\t" - "addpd %%xmm10 , %%xmm4 \n\t" - "addpd %%xmm4 , %%xmm11 \n\t" - - "mulpd %%xmm6 , %%xmm11 \n\t" - "addpd %%xmm7 , %%xmm11 \n\t" - "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y + "mulpd %%xmm6 , %%xmm5 \n\t" + "addpd %%xmm7 , %%xmm5 \n\t" + "movups %%xmm5 , -16(%3,%0,8) \n\t" // 2 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" + "2: \n\t" + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + + "mulpd %%xmm12, %%xmm8 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm11 , %%xmm4 \n\t" + + "mulpd %%xmm12, %%xmm0 \n\t" + "addpd %%xmm0 , %%xmm5 \n\t" + "mulpd %%xmm13, %%xmm1 \n\t" + "addpd %%xmm1 , %%xmm5 \n\t" + "mulpd %%xmm14, %%xmm2 \n\t" + "addpd %%xmm2 , %%xmm5 \n\t" 
+ "mulpd %%xmm15, %%xmm3 \n\t" + "addpd %%xmm3 , %%xmm5 \n\t" + + "movups -32(%3,%0,8), %%xmm7 \n\t" // 2 * y + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm7 , %%xmm4 \n\t" + "movups %%xmm4 , -32(%3,%0,8) \n\t" // 2 * y + + "movups -16(%3,%0,8), %%xmm7 \n\t" // 2 * y + "mulpd %%xmm6 , %%xmm5 \n\t" + "addpd %%xmm7 , %%xmm5 \n\t" + "movups %%xmm5 , -16(%3,%0,8) \n\t" // 2 * y + : : "r" (i), // 0 @@ -253,8 +160,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 5d85ecab7..7c550a759 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -293,7 +293,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); xbuffer = buffer; - ytemp = buffer + NBMAX; + ytemp = buffer + (m < NBMAX ? m : NBMAX); n0 = n / NBMAX; n1 = (n % NBMAX) >> 2 ; diff --git a/kernel/x86_64/dger.c b/kernel/x86_64/dger.c new file mode 100644 index 000000000..157a8ea7f --- /dev/null +++ b/kernel/x86_64/dger.c @@ -0,0 +1,84 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#if defined(SANDYBRIDGE) +#include "dger_microk_sandy-2.c" +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + BLASLONG m1 = m & -16; + + while (n > 0) + { + FLOAT y0 = alpha * *y; + if ( m1 > 0 ) + { + #ifdef HAVE_KERNEL_16 + dger_kernel_16(m1, X, a, &y0); + #else + AXPYU_K(m1, 0, 0, y0, X, 1, a, 1, NULL, 0); + #endif + } + + if ( m > m1 ) + { + AXPYU_K(m-m1, 0, 0, y0, X+m1 , 1, a+m1, 1, NULL, 0); + } + + a += lda; + y += incy; + n --; + } + + return 0; +} + diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c new file mode 100644 index 000000000..564f1356d --- /dev/null +++ b/kernel/x86_64/dger_microk_sandy-2.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + "prefetcht0 256(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" + "vmovups 16(%3,%0,8), %%xmm9 \n\t" + "vmovups 32(%3,%0,8), %%xmm10 \n\t" + "vmovups 48(%3,%0,8), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm4 \n\t" + "vmovups 16(%2,%0,8), %%xmm5 \n\t" + "vmovups 32(%2,%0,8), %%xmm6 \n\t" + "vmovups 48(%2,%0,8), %%xmm7 \n\t" + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + "jz 2f \n\t" + + ".align 8 \n\t" + "1: \n\t" + + "vmulpd %%xmm4, %%xmm0, %%xmm4 \n\t" + "vaddpd %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vmulpd %%xmm5, %%xmm0, %%xmm5 \n\t" + "vaddpd %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vmulpd %%xmm6, %%xmm0, %%xmm6 \n\t" + "vaddpd %%xmm10, %%xmm6, %%xmm14 \n\t" + "vmulpd %%xmm7, %%xmm0, %%xmm7 \n\t" + "vaddpd %%xmm11, %%xmm7, %%xmm15 \n\t" + + "prefetcht0 256(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm8 \n\t" + "vmovups 16(%3,%0,8), %%xmm9 \n\t" + "vmovups 32(%3,%0,8), %%xmm10 \n\t" + "vmovups 48(%3,%0,8), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm4 \n\t" + "vmovups 16(%2,%0,8), %%xmm5 \n\t" + "vmovups 32(%2,%0,8), %%xmm6 \n\t" + "vmovups 48(%2,%0,8), %%xmm7 \n\t" + + "vmovups %%xmm12, -64(%3,%0,8) \n\t" + "vmovups %%xmm13, -48(%3,%0,8) \n\t" + "vmovups %%xmm14, -32(%3,%0,8) \n\t" + "vmovups %%xmm15, -16(%3,%0,8) \n\t" + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + "vmulpd %%xmm4, %%xmm0, %%xmm4 \n\t" + "vmulpd %%xmm5, %%xmm0, %%xmm5 \n\t" + "vmulpd %%xmm6, %%xmm0, %%xmm6 \n\t" + "vmulpd %%xmm7, %%xmm0, %%xmm7 \n\t" + + "vaddpd %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vaddpd %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vaddpd %%xmm10, %%xmm6, %%xmm14 \n\t" + "vaddpd %%xmm11, %%xmm7, %%xmm15 \n\t" + + "vmovups %%xmm12, -64(%3,%0,8) \n\t" + "vmovups %%xmm13, -48(%3,%0,8) \n\t" + "vmovups %%xmm14, -32(%3,%0,8) \n\t" + "vmovups %%xmm15, -16(%3,%0,8) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c new file mode 100644 index 000000000..e3e2b0d58 --- /dev/null +++ b/kernel/x86_64/dscal.c @@ -0,0 +1,238 @@ +/*************************************************************************** +Copyright (c) 2013 - 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "dscal_microk_bulldozer-2.c" +#elif defined(SANDYBRIDGE) +#include "dscal_microk_sandy-2.c" +#elif defined(HASWELL) +#include "dscal_microk_haswell-2.c" +#endif + + +#if !defined(HAVE_KERNEL_8) + +static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + dscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + + while(j < n) + { + + x[i] *= da; + i += inc_x ; + j++; + + } + + } + + return(0); + } + + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + if ( da == 0.0 ) + dscal_kernel_8_zero(n1 , &da , x); + else + dscal_kernel_8(n1 , &da , x); + } + + if ( da == 0.0 ) + { + for ( i=n1 ; i> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" + "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" + "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 256(%1) \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + + "prefetcht0 320(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 5f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups 
%%xmm7 , -80(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "vmovups %%xmm0 , -64(%1) \n\t" + "vmovups %%xmm0 , -48(%1) \n\t" + "vmovups %%xmm0 , -32(%1) \n\t" + "vmovups %%xmm0 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c new file mode 100644 index 000000000..07a9c804c --- /dev/null +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
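/*
 * Editor's sketch, not part of the patch: what the dscal_kernel_8 micro-kernel
 * variants added in this patch compute.  Their callers pass n as a multiple of
 * 8; in the assembly, n >> 4 counts the 16-element blocks handled per loop
 * trip (128 bytes) and n & 8 flags one trailing block of 8.  The
 * dscal_kernel_8_zero variant stores zeros without reading x.  Types assumed
 * to be long/double.
 */
static void dscal_kernel_8_ref(long n, const double *alpha, double *x)
{
    for (long i = 0; i < n; i++)
        x[i] *= *alpha;
}

static void dscal_kernel_8_zero_ref(long n, const double *alpha, double *x)
{
    (void)alpha;                       /* kept only for signature parity */
    for (long i = 0; i < n; i++)
        x[i] = 0.0;
}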
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" + "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" + "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + // "prefetcht0 640(%1) \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + + // "prefetcht0 704(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 5f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "vmovups %%xmm0 , -64(%1) \n\t" + "vmovups %%xmm0 , -48(%1) \n\t" + "vmovups %%xmm0 , -32(%1) \n\t" + "vmovups %%xmm0 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), 
// 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c new file mode 100644 index 000000000..f5bf5932f --- /dev/null +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" + "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" + "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 640(%1) \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + + "prefetcht0 704(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 5f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "vmovups %%xmm0 , -64(%1) \n\t" + "vmovups %%xmm0 , -48(%1) \n\t" + "vmovups %%xmm0 , -32(%1) \n\t" + "vmovups %%xmm0 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 
+ "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index f6157f791..3f5e77e5f 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,6 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dsymv_L_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "dsymv_L_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "dsymv_L_microk_sandy-2.c" #elif defined(NEHALEM) #include "dsymv_L_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c new file mode 100644 index 000000000..bc5ec6b87 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a + + "vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a + "addq $4 , %0 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + "vmovsd 24(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c new file mode 100644 index 000000000..c87084915 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
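/*
 * Editor's sketch, not part of the patch: scalar equivalent of the
 * dsymv_kernel_4x4 routines (Haswell above, Sandy Bridge below).  Rows
 * [from, to) of four columns of A are processed at once: y accumulates the
 * temp1-scaled columns, while temp2 accumulates the dot products of those
 * columns with x.  Types assumed to be long/double.
 */
static void dsymv_kernel_4x4_ref(long from, long to, double **a,
                                 const double *x, double *y,
                                 const double *temp1, double *temp2)
{
    for (long i = from; i < to; i++) {
        for (int j = 0; j < 4; j++) {
            y[i]     += temp1[j] * a[j][i];   /* the vfmadd231pd into ymm9     */
            temp2[j] += x[i]     * a[j][i];   /* accumulated in ymm0..ymm3 and */
        }                                     /* reduced after the loop        */
    }
}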
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a + + "vmulpd %%ymm4, %%ymm12, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm12, %%ymm11 \n\t" + "vaddpd %%ymm0, %%ymm11, %%ymm0 \n\t" + + "vmulpd %%ymm5, %%ymm13, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm13, %%ymm11 \n\t" + "vaddpd %%ymm1, %%ymm11, %%ymm1 \n\t" + + "vmulpd %%ymm6, %%ymm14, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm14, %%ymm11 \n\t" + "vaddpd %%ymm2, %%ymm11, %%ymm2 \n\t" + + "vmulpd %%ymm7, %%ymm15, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm3, %%ymm11, %%ymm3 \n\t" + + "addq $4 , %0 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + "vmovsd 24(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , 
%%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index ecfaf5043..9f5ae3015 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,6 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dsymv_U_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "dsymv_U_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "dsymv_U_microk_sandy-2.c" #elif defined(NEHALEM) #include "dsymv_U_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c new file mode 100644 index 000000000..6ce384f93 --- /dev/null +++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
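/*
 * Editor's sketch, not part of the patch: the dsymv_U kernels that follow
 * perform the same 4-column update as the _L variant, but take the columns as
 * four separate pointers and count a trip count n (a multiple of 4) down to
 * zero instead of walking an index from `from` to `to`.  Types assumed.
 */
static void dsymv_kernel_4x4_u_ref(long n, const double *a0, const double *a1,
                                   const double *a2, const double *a3,
                                   const double *x, double *y,
                                   const double *temp1, double *temp2)
{
    const double *a[4] = { a0, a1, a2, a3 };
    for (long i = 0; i < n; i++)
        for (int j = 0; j < 4; j++) {
            y[i]     += temp1[j] * a[j][i];
            temp2[j] += x[i]     * a[j][i];
        }
}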
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1] + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a + + "vfmadd231pd %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a + + "vfmadd231pd %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231pd %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + "vmovsd 24(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c new file mode 100644 index 000000000..212d4cf7b --- /dev/null +++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All 
rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorpd %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorpd %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorpd %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastsd (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastsd 8(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1] + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,8), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,8), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,8), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,8), %%ymm15 \n\t" // 2 * a + + "vmulpd %%ymm4, %%ymm12, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm12, %%ymm11 \n\t" + "vaddpd %%ymm0, %%ymm11, %%ymm0 \n\t" + + "vmulpd %%ymm5, %%ymm13, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm13, %%ymm11 \n\t" + "vaddpd %%ymm1, %%ymm11, %%ymm1 \n\t" + + "vmulpd %%ymm6, %%ymm14, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm14, %%ymm11 \n\t" + "vaddpd %%ymm2, %%ymm11, %%ymm2 \n\t" + + "vmulpd %%ymm7, %%ymm15, %%ymm10 \n\t" + "vaddpd %%ymm9, %%ymm10, %%ymm9 \n\t" + "vmulpd %%ymm8, %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm3, %%ymm11, %%ymm3 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "vmovups %%ymm9 , -32(%3,%0,8) \n\t" + + "jnz 1b \n\t" + + "vmovsd (%9), %%xmm4 \n\t" + "vmovsd 8(%9), %%xmm5 \n\t" + "vmovsd 16(%9), %%xmm6 \n\t" + "vmovsd 24(%9), 
%%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddpd %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddpd %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddpd %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovsd %%xmm0 , (%9) \n\t" // save temp2 + "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2 + "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2 + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c new file mode 100644 index 000000000..ac8c97d03 --- /dev/null +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -0,0 +1,1546 @@ +#include "common.h" +#include + + +static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) __attribute__ ((noinline)); + +static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) +{ + + BLASLONG i = 0; + BLASLONG temp1 = n * 8; + + __asm__ __volatile__ + ( + " vxorpd %%ymm4 , %%ymm4 , %%ymm4 \n\t" + " vxorpd %%ymm5 , %%ymm5 , %%ymm5 \n\t" + " vxorpd %%ymm6 , %%ymm6 , %%ymm6 \n\t" + " vxorpd %%ymm7 , %%ymm7 , %%ymm7 \n\t" + " vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" + " vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" + " vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + " vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + + " cmp $0, %1 \n\t" + " jz 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + " vmovups (%2,%0,4) , %%ymm0 \n\t" + " vmovups (%3,%0,8) , %%ymm1 \n\t" + " vmovups 32(%3,%0,8) , %%ymm2 \n\t" + + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" + + " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" + + " vpermpd $0x1b , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" + + " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" + + " addq $8 , %0 \n\t" + " cmp %0 , %1 \n\t" + " jne 1b \n\t" + + "2: \n\t" + + " vbroadcastsd (%4), %%ymm0 \n\t" + + " vmulpd %%ymm0 , %%ymm4 , %%ymm4 \n\t" + " vmulpd %%ymm0 , %%ymm5 , %%ymm5 \n\t" + " vmulpd %%ymm0 , %%ymm6 , %%ymm6 \n\t" + " vmulpd %%ymm0 , %%ymm7 , %%ymm7 \n\t" + " vmulpd %%ymm0 , %%ymm8 , %%ymm8 \n\t" + " vmulpd %%ymm0 , %%ymm9 , %%ymm9 \n\t" + " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" + " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" + + " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" + " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + + " vblendpd $0x0a , 
%%ymm5 , %%ymm4 , %%ymm0 \n\t" + " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" + " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" + " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" + + " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" + " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + + " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" + " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" + " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t" + " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t" + + " vmovups %%ymm4 , (%5) \n\t" + " vmovups %%ymm5 , (%6) \n\t" + " vmovups %%ymm6 , (%7) \n\t" + " vmovups %%ymm7 , (%8) \n\t" + + " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" + " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + + " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" + " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" + " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" + " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" + + " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" + " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + + " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" + " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" + " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t" + " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t" + + " vmovups %%ymm4 , (%9) \n\t" + " vmovups %%ymm5 , (%10) \n\t" + " vmovups %%ymm6 , (%11) \n\t" + " vmovups %%ymm7 , (%12) \n\t" + + : + : + "a" (i), // 0 + "r" (temp1), // 1 + "S" (a), // 2 + "D" (b), // 3 + "r" (alpha), // 4 + "r" (C0), // 5 + "r" (C1), // 6 + "r" (C2), // 7 + "r" (C3), // 8 + "r" (C4), // 9 + "r" (C5), // 10 + "r" (C6), // 11 + "r" (C7) // 12 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + +} + + + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp ; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j +#include "common.h" + +#if defined(SANDYBRIDGE) +#include "sger_microk_sandy-2.c" +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + BLASLONG m1 = m & -16; + + while (n > 0) + { + FLOAT y0 = alpha * *y; + if ( m1 > 0 ) + { + #ifdef 
HAVE_KERNEL_16 + sger_kernel_16(m1, X, a, &y0); + #else + AXPYU_K(m1, 0, 0, y0, X, 1, a, 1, NULL, 0); + #endif + } + + if ( m > m1 ) + { + AXPYU_K(m-m1, 0, 0, y0, X+m1 , 1, a+m1, 1, NULL, 0); + } + + a += lda; + y += incy; + n --; + } + + return 0; +} + diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c new file mode 100644 index 000000000..51c3bef3e --- /dev/null +++ b/kernel/x86_64/sger_microk_sandy-2.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%4), %%xmm0 \n\t" // alpha + "prefetcht0 256(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm8 \n\t" + "vmovups 16(%3,%0,4), %%xmm9 \n\t" + "vmovups 32(%3,%0,4), %%xmm10 \n\t" + "vmovups 48(%3,%0,4), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm4 \n\t" + "vmovups 16(%2,%0,4), %%xmm5 \n\t" + "vmovups 32(%2,%0,4), %%xmm6 \n\t" + "vmovups 48(%2,%0,4), %%xmm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmulps %%xmm4, %%xmm0, %%xmm4 \n\t" + "vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vmulps %%xmm5, %%xmm0, %%xmm5 \n\t" + "vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vmulps %%xmm6, %%xmm0, %%xmm6 \n\t" + "vaddps %%xmm10, %%xmm6, %%xmm14 \n\t" + "vmulps %%xmm7, %%xmm0, %%xmm7 \n\t" + "vaddps %%xmm11, %%xmm7, %%xmm15 \n\t" + + "prefetcht0 256(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%xmm8 \n\t" + "vmovups 16(%3,%0,4), %%xmm9 \n\t" + "vmovups 32(%3,%0,4), %%xmm10 \n\t" + "vmovups 48(%3,%0,4), %%xmm11 \n\t" + + "prefetcht0 256(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm4 \n\t" + "vmovups 16(%2,%0,4), %%xmm5 \n\t" + "vmovups 32(%2,%0,4), %%xmm6 \n\t" + "vmovups 48(%2,%0,4), %%xmm7 \n\t" + + "vmovups %%xmm12, -64(%3,%0,4) \n\t" + "vmovups %%xmm13, -48(%3,%0,4) \n\t" + "vmovups %%xmm14, -32(%3,%0,4) \n\t" + "vmovups %%xmm15, -16(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + "vmulps %%xmm4, %%xmm0, %%xmm4 \n\t" + "vmulps %%xmm5, %%xmm0, %%xmm5 \n\t" + "vmulps %%xmm6, %%xmm0, %%xmm6 \n\t" + "vmulps %%xmm7, %%xmm0, %%xmm7 \n\t" + + "vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t" + "vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t" + "vaddps %%xmm10, %%xmm6, %%xmm14 \n\t" + "vaddps %%xmm11, %%xmm7, %%xmm15 \n\t" + + "vmovups %%xmm12, -64(%3,%0,4) \n\t" + "vmovups %%xmm13, -48(%3,%0,4) \n\t" + "vmovups %%xmm14, -32(%3,%0,4) \n\t" + "vmovups %%xmm15, -16(%3,%0,4) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index a2b716b58..0997f108d 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,6 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "ssymv_L_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "ssymv_L_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c new file mode 100644 index 000000000..516524528 --- /dev/null +++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] + "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] + "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] + "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[1] + "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[1] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + + "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a + "vmovups (%5,%0,4), %%xmm13 \n\t" // 2 * a + "vmovups (%6,%0,4), %%xmm14 \n\t" // 2 * a + "vmovups (%7,%0,4), %%xmm15 \n\t" // 2 * a + + "vfmadd231ps %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "vfmadd231ps %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "vfmadd231ps %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "vfmadd231ps %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a + "vfmadd231ps %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "vmovups %%xmm9 , (%3,%0,4) \n\t" + + "addq $4 , %0 \n\t" + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovss (%9), %%xmm4 \n\t" + "vmovss 4(%9), %%xmm5 \n\t" + "vmovss 8(%9), %%xmm6 \n\t" + "vmovss 12(%9), %%xmm7 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, 
%%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddss %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddss %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddss %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddss %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c new file mode 100644 index 000000000..07293a964 --- /dev/null +++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c @@ -0,0 +1,243 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
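/*
 * Editor's sketch, not part of the patch: the single-precision ssymv kernels
 * (Haswell above, Sandy Bridge below) compute the same 4-column update as the
 * dsymv kernels earlier in this patch, just in float.  The Sandy Bridge
 * variant additionally dispatches on (to - from) & 4: it runs a 4-wide SSE
 * loop when the trip count is not a multiple of 8 and an 8-wide AVX loop
 * otherwise.  Types assumed (long/float).
 */
static void ssymv_kernel_4x4_ref(long from, long to, float **a,
                                 const float *x, float *y,
                                 const float *temp1, float *temp2)
{
    for (long i = from; i < to; i++)
        for (int j = 0; j < 4; j++) {
            y[i]     += temp1[j] * a[j][i];
            temp2[j] += x[i]     * a[j][i];
        }
}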
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + if ( ( to - from ) & 4 ) + { + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] + "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] + "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] + "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[1] + "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[1] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y + "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x + + "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a + "vmovups (%5,%0,4), %%xmm13 \n\t" // 2 * a + "vmovups (%6,%0,4), %%xmm14 \n\t" // 2 * a + "vmovups (%7,%0,4), %%xmm15 \n\t" // 2 * a + + "vmulps %%xmm4, %%xmm12 , %%xmm10 \n\t" + "vmulps %%xmm8, %%xmm12 , %%xmm11 \n\t" + "vaddps %%xmm9, %%xmm10 , %%xmm9 \n\t" + "vaddps %%xmm0, %%xmm11 , %%xmm0 \n\t" + + "vmulps %%xmm5, %%xmm13 , %%xmm10 \n\t" + "vmulps %%xmm8, %%xmm13 , %%xmm11 \n\t" + "vaddps %%xmm9, %%xmm10 , %%xmm9 \n\t" + "vaddps %%xmm1, %%xmm11 , %%xmm1 \n\t" + + "vmulps %%xmm6, %%xmm14 , %%xmm10 \n\t" + "vmulps %%xmm8, %%xmm14 , %%xmm11 \n\t" + "vaddps %%xmm9, %%xmm10 , %%xmm9 \n\t" + "vaddps %%xmm2, %%xmm11 , %%xmm2 \n\t" + + "vmulps %%xmm7, %%xmm15 , %%xmm10 \n\t" + "vmulps %%xmm8, %%xmm15 , %%xmm11 \n\t" + "vaddps %%xmm9, %%xmm10 , %%xmm9 \n\t" + "vaddps %%xmm3, %%xmm11 , %%xmm3 \n\t" + + "vmovups %%xmm9 , (%3,%0,4) \n\t" + + "addq $4 , %0 \n\t" + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovss (%9), %%xmm4 \n\t" + "vmovss 4(%9), %%xmm5 \n\t" + "vmovss 8(%9), %%xmm6 \n\t" + "vmovss 12(%9), %%xmm7 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddss %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddss %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddss %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddss %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorps %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorps %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorps %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%ymm6 \n\t" // temp1[1] + 
"vbroadcastss 12(%8), %%ymm7 \n\t" // temp1[1] + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,4), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,4), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,4), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,4), %%ymm15 \n\t" // 2 * a + + "vmulps %%ymm4, %%ymm12 , %%ymm10 \n\t" + "vmulps %%ymm8, %%ymm12 , %%ymm11 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vaddps %%ymm0, %%ymm11 , %%ymm0 \n\t" + + "vmulps %%ymm5, %%ymm13 , %%ymm10 \n\t" + "vmulps %%ymm8, %%ymm13 , %%ymm11 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vaddps %%ymm1, %%ymm11 , %%ymm1 \n\t" + + "vmulps %%ymm6, %%ymm14 , %%ymm10 \n\t" + "vmulps %%ymm8, %%ymm14 , %%ymm11 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vaddps %%ymm2, %%ymm11 , %%ymm2 \n\t" + + "vmulps %%ymm7, %%ymm15 , %%ymm10 \n\t" + "vmulps %%ymm8, %%ymm15 , %%ymm11 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vaddps %%ymm3, %%ymm11 , %%ymm3 \n\t" + + "vmovups %%ymm9 , (%3,%0,4) \n\t" + + "addq $8 , %0 \n\t" + "cmpq %0 , %1 \n\t" + "jnz 1b \n\t" + + "vmovss (%9), %%xmm4 \n\t" + "vmovss 4(%9), %%xmm5 \n\t" + "vmovss 8(%9), %%xmm6 \n\t" + "vmovss 12(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddps %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddps %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddps %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddps %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddss %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddss %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddss %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddss %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + +} + + diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 0aadd3fd2..ed1e8236c 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,6 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" +#elif defined(HASWELL) +#include "ssymv_U_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "ssymv_U_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_4x4 diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c new file mode 100644 index 000000000..42f801c96 --- /dev/null +++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorps %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorps %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorps %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastss 12(%8), %%ymm7 \n\t" // temp1[1] + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,4), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,4), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,4), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,4), %%ymm15 \n\t" // 2 * a + + "vfmadd231ps %%ymm4, %%ymm12 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm12 , %%ymm0 \n\t" // temp2 += x * a + + "vfmadd231ps %%ymm5, %%ymm13 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm13 , %%ymm1 \n\t" // temp2 += x * a + + "vfmadd231ps %%ymm6, %%ymm14 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm14 , %%ymm2 \n\t" // temp2 += x * a + + "vfmadd231ps %%ymm7, %%ymm15 , %%ymm9 \n\t" // y += temp1 * a + "vfmadd231ps %%ymm8, %%ymm15 , %%ymm3 \n\t" // temp2 += x * a + + "vmovups %%ymm9 , (%3,%0,4) \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + "jnz 1b \n\t" + + "vmovss (%9), %%xmm4 \n\t" + "vmovss 4(%9), %%xmm5 \n\t" + "vmovss 8(%9), %%xmm6 \n\t" + "vmovss 12(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + 
"vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddps %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddps %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddps %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddps %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c new file mode 100644 index 000000000..4b699af50 --- /dev/null +++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0 , %%ymm0 \n\t" // temp2[0] + "vxorps %%ymm1 , %%ymm1 , %%ymm1 \n\t" // temp2[1] + "vxorps %%ymm2 , %%ymm2 , %%ymm2 \n\t" // temp2[2] + "vxorps %%ymm3 , %%ymm3 , %%ymm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%ymm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%ymm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%ymm6 \n\t" // temp1[1] + "vbroadcastss 12(%8), %%ymm7 \n\t" // temp1[1] + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%3,%0,4), %%ymm9 \n\t" // 2 * y + "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x + + "vmovups (%4,%0,4), %%ymm12 \n\t" // 2 * a + "vmovups (%5,%0,4), %%ymm13 \n\t" // 2 * a + "vmovups (%6,%0,4), %%ymm14 \n\t" // 2 * a + "vmovups (%7,%0,4), %%ymm15 \n\t" // 2 * a + + "vmulps %%ymm4, %%ymm12 , %%ymm10 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vmulps %%ymm8, %%ymm12 , %%ymm11 \n\t" + "vaddps %%ymm0, %%ymm11 , %%ymm0 \n\t" + + "vmulps %%ymm5, %%ymm13 , %%ymm10 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vmulps %%ymm8, %%ymm13 , %%ymm11 \n\t" + "vaddps %%ymm1, %%ymm11 , %%ymm1 \n\t" + + "vmulps %%ymm6, %%ymm14 , %%ymm10 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vmulps %%ymm8, %%ymm14 , %%ymm11 \n\t" + "vaddps %%ymm2, %%ymm11 , %%ymm2 \n\t" + + "vmulps %%ymm7, %%ymm15 , %%ymm10 \n\t" + "vaddps %%ymm9, %%ymm10 , %%ymm9 \n\t" + "vmulps %%ymm8, %%ymm15 , %%ymm11 \n\t" + "vaddps %%ymm3, %%ymm11 , %%ymm3 \n\t" + + "vmovups %%ymm9 , (%3,%0,4) \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + "jnz 1b \n\t" + + "vmovss (%9), %%xmm4 \n\t" + "vmovss 4(%9), %%xmm5 \n\t" + "vmovss 8(%9), %%xmm6 \n\t" + "vmovss 12(%9), %%xmm7 \n\t" + + "vextractf128 $0x01, %%ymm0 , %%xmm12 \n\t" + "vextractf128 $0x01, %%ymm1 , %%xmm13 \n\t" + "vextractf128 $0x01, %%ymm2 , %%xmm14 \n\t" + "vextractf128 $0x01, %%ymm3 , %%xmm15 \n\t" + + "vaddps %%xmm0, %%xmm12, %%xmm0 \n\t" + "vaddps %%xmm1, %%xmm13, %%xmm1 \n\t" + "vaddps %%xmm2, %%xmm14, %%xmm2 \n\t" + "vaddps %%xmm3, %%xmm15, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t" + "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t" + "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 8 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/zaxpy.c 
b/kernel/x86_64/zaxpy.c index 52a25c793..560acc7f9 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -29,8 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) #include "zaxpy_microk_bulldozer-2.c" +#elif defined(PILEDRIVER) || defined(STEAMROLLER) +#include "zaxpy_microk_steamroller-2.c" +#elif defined(HASWELL) +#include "zaxpy_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "zaxpy_microk_sandy-2.c" #endif @@ -78,13 +84,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; + BLASLONG n1 = n & -16; if ( n1 ) { da[0] = da_r; da[1] = da_i; - zaxpy_kernel_4(n1, x, y , &da ); + zaxpy_kernel_4(n1, x, y , da ); ix = 2 * n1; } i = n1; diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index f9732cd4e..0e15761f7 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -31,89 +31,87 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __att static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { +#if !defined(CONJ) + FLOAT mvec[2] = { -1.0, 1.0 }; +#else + FLOAT mvec[2] = { 1.0, -1.0 }; +#endif BLASLONG register i = 0; + if ( n < 384 ) + { + __asm__ __volatile__ ( + "vzeroupper \n\t" "vmovddup (%4), %%xmm0 \n\t" // real part of alpha "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif ".align 16 \n\t" "1: \n\t" - "prefetcht0 768(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x + ".align 2 \n\t" "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x - "prefetcht0 768(%3,%0,8) \n\t" -#if !defined(CONJ) - "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12 \n\t" + "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 1 complex values from x + "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 1 complex values from x + "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 1 complex values from x + "vmovups 112(%2,%0,8), %%xmm15 \n\t" // 1 complex values from x + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part - "vmulpd %%xmm1, %%xmm4 , %%xmm4 \n\t" - - "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13 \n\t" "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part - "vmulpd %%xmm1, %%xmm6 , %%xmm6 \n\t" - - "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part - "vmulpd %%xmm1, %%xmm8 , %%xmm8 \n\t" - - "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part - "vmulpd %%xmm1, %%xmm10, %%xmm10 \n\t" - "vaddsubpd %%xmm4, %%xmm12, %%xmm12 \n\t" - "vaddsubpd %%xmm6, %%xmm13, %%xmm13 \n\t" - "vaddsubpd %%xmm8, %%xmm14, %%xmm14 \n\t" - "vaddsubpd %%xmm10,%%xmm15, %%xmm15 \n\t" + "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" + ".align 2 \n\t" + "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" -#else + "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddpd 
%%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" - "vmulpd %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i - "vmulpd %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i - "vmulpd %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i - "vmulpd %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i - "vmulpd %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i + "vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part - "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part - "vaddsubpd %%xmm4 ,%%xmm5 , %%xmm4 \n\t" - "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + "vfmaddpd 64(%3,%0,8), %%xmm0 , %%xmm12, %%xmm12 \n\t" + "vfmaddpd 80(%3,%0,8), %%xmm0 , %%xmm13, %%xmm13 \n\t" + "vfmaddpd 96(%3,%0,8), %%xmm0 , %%xmm14, %%xmm14 \n\t" + "vfmaddpd 112(%3,%0,8), %%xmm0 , %%xmm15, %%xmm15 \n\t" - "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part - "vaddsubpd %%xmm6 ,%%xmm7 , %%xmm6 \n\t" - "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + "vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmaddpd %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmaddpd %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmaddpd %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t" - "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part - "vaddsubpd %%xmm8 ,%%xmm9 , %%xmm8 \n\t" - "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + "vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" - "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part - "vaddsubpd %%xmm10,%%xmm11, %%xmm10 \n\t" - "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part - - "vaddpd (%3,%0,8) ,%%xmm4 , %%xmm12 \n\t" - "vaddpd 16(%3,%0,8) ,%%xmm6 , %%xmm13 \n\t" - "vaddpd 32(%3,%0,8) ,%%xmm8 , %%xmm14 \n\t" - "vaddpd 48(%3,%0,8) ,%%xmm10, %%xmm15 \n\t" - - -#endif - - "vmovups %%xmm12, (%3,%0,8) \n\t" - "vmovups %%xmm13, 16(%3,%0,8) \n\t" - "vmovups %%xmm14, 32(%3,%0,8) \n\t" - "vmovups %%xmm15, 48(%3,%0,8) \n\t" - - "addq $8 , %0 \n\t" - "subq $4 , %1 \n\t" + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" "jnz 1b \n\t" + "vzeroupper \n\t" : : @@ -121,7 +119,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 - "r" (alpha) // 4 + "r" (alpha), // 4 + "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", @@ -129,7 +128,73 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vmovddup (%4), %%xmm0 \n\t" // real part of alpha + "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 
16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x + "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x + + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "prefetcht0 512(%3,%0,8) \n\t" + "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" + ".align 2 \n\t" + "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" + "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" + "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" + + "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmaddpd %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + "vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + } - diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c new file mode 100644 index 000000000..e7e559502 --- /dev/null +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,8), %%ymm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 2 complex values from x + "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 complex values from x + + "vmovups 128(%2,%0,8), %%ymm12 \n\t" // 2 complex values from x + "vmovups 160(%2,%0,8), %%ymm13 \n\t" // 2 complex values from x + "vmovups 192(%2,%0,8), %%ymm14 \n\t" // 2 complex values from x + "vmovups 224(%2,%0,8), %%ymm15 \n\t" // 2 complex values from x + + "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213pd (%3,%0,8), %%ymm0 , %%ymm5 \n\t" + ".align 2 \n\t" + "vfmadd213pd 32(%3,%0,8), %%ymm0 , %%ymm7 \n\t" + "vfmadd213pd 64(%3,%0,8), %%ymm0 , %%ymm9 \n\t" + "vfmadd213pd 96(%3,%0,8), %%ymm0 , %%ymm11 \n\t" + + "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm5 \n\t" + "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm7 \n\t" + "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm9 \n\t" + "vfmadd231pd %%ymm1 , %%ymm10, %%ymm11 \n\t" + + "vpermilpd $0x5 , %%ymm12, %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm13, %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm14, %%ymm8 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm15, %%ymm10 \n\t" // exchange real and imag part + + "vfmadd213pd 128(%3,%0,8), %%ymm0 , %%ymm12 \n\t" + "vfmadd213pd 160(%3,%0,8), %%ymm0 , %%ymm13 \n\t" + "vfmadd213pd 192(%3,%0,8), %%ymm0 , %%ymm14 \n\t" + "vfmadd213pd 224(%3,%0,8), %%ymm0 , %%ymm15 \n\t" + + "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm12 \n\t" + "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm13 \n\t" + "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm14 \n\t" + "vfmadd231pd %%ymm1 , %%ymm10, %%ymm15 \n\t" + + "vmovups %%ymm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,8) \n\t" + "vmovups %%ymm9 , 64(%3,%0,8) \n\t" + "vmovups %%ymm11, 96(%3,%0,8) \n\t" + + "vmovups %%ymm12,128(%3,%0,8) \n\t" + "vmovups %%ymm13,160(%3,%0,8) \n\t" + "vmovups %%ymm14,192(%3,%0,8) \n\t" + "vmovups %%ymm15,224(%3,%0,8) \n\t" + + "addq $32, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c new file mode 100644 index 000000000..8b0a7ed05 --- /dev/null +++ 
b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + if ( n < 1280 ) + { + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,8), %%ymm5 \n\t" // 4 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 4 complex values from x + "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 4 complex values from x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 4 complex values from x + + "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" + "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" + + "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" + "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" + "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" + "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" + + "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" + "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm8 \n\t" + "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + 
"vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" + "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" + "vaddpd %%ymm10, %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,8) \n\t" + "vmovups %%ymm9 , 64(%3,%0,8) \n\t" + "vmovups %%ymm11, 96(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha + "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" +#else + "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,8) \n\t" + "prefetcht0 576(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm5 \n\t" // 4 complex values from x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 4 complex values from x + "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 4 complex values from x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 4 complex values from x + + "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part + "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part + + "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" + "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" + + "prefetcht0 512(%3,%0,8) \n\t" + "prefetcht0 576(%3,%0,8) \n\t" + "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" + "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" + "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" + "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" + + "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" + "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm8 \n\t" + "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" + "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" + "vaddpd %%ymm10, %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%ymm7 , 32(%3,%0,8) \n\t" + "vmovups %%ymm9 , 64(%3,%0,8) \n\t" + "vmovups %%ymm11, 96(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + + + +} + diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c new file mode 100644 index 000000000..728d09213 --- /dev/null +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + +#if !defined(CONJ) + FLOAT mvec[2] = { -1.0, 1.0 }; +#else + FLOAT mvec[2] = { 1.0, -1.0 }; +#endif + + BLASLONG register i = 0; + + if ( n < 640 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vmovddup (%4), %%xmm0 \n\t" // real part of alpha + "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%2,%0,8), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 2 complex values from x + + "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 2 complex values from x + "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 2 complex values from x + "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 2 complex values from x + "vmovups 112(%2,%0,8), %%xmm15 \n\t" // 2 complex values from x + + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" + "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" + "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part + + "vfmadd213pd 64(%3,%0,8), %%xmm0 , %%xmm12 \n\t" + "vfmadd213pd 80(%3,%0,8), %%xmm0 , %%xmm13 \n\t" + 
"vfmadd213pd 96(%3,%0,8), %%xmm0 , %%xmm14 \n\t" + "vfmadd213pd 112(%3,%0,8), %%xmm0 , %%xmm15 \n\t" + + "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm12 \n\t" + "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm13 \n\t" + "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm14 \n\t" + "vfmadd231pd %%xmm1 , %%xmm10, %%xmm15 \n\t" + + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + "vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + "vmovups %%xmm12, 64(%3,%0,8) \n\t" + "vmovups %%xmm13, 80(%3,%0,8) \n\t" + "vmovups %%xmm14, 96(%3,%0,8) \n\t" + "vmovups %%xmm15,112(%3,%0,8) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vmovddup (%4), %%xmm0 \n\t" // real part of alpha + "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha +#if !defined(CONJ) + "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" +#else + "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" +#endif + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm5 \n\t" // 2 complex values from x + ".align 2 \n\t" + "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 2 complex values from x + + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + + "prefetcht0 512(%3,%0,8) \n\t" + "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" + ".align 2 \n\t" + "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" + "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" + "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" + + "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" + "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" + "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" + "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" + + "vmovups %%xmm5 , (%3,%0,8) \n\t" + ".align 2 \n\t" + "vmovups %%xmm7 , 16(%3,%0,8) \n\t" + "vmovups %%xmm9 , 32(%3,%0,8) \n\t" + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + "r" (mvec) // 5 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "memory" + ); + + +} + diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c new file mode 100644 index 000000000..eee00fd9f --- /dev/null +++ b/kernel/x86_64/zdot.c @@ -0,0 +1,166 @@ +/*************************************************************************** +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include + + +#if defined(BULLDOZER) +#include "zdot_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#include "zdot_microk_steamroller-2.c" +#elif defined(HASWELL) +#include "zdot_microk_haswell-2.c" +#elif defined(SANDYBRIDGE) +#include "zdot_microk_sandy-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + dot[0] += x[j+2] * y[j+2] ; + dot[1] += x[j+3] * y[j+3] ; + dot[2] += x[j+2] * y[j+3] ; + dot[3] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[2] += x[j+4] * y[j+5] ; + dot[3] += x[j+5] * y[j+4] ; + + dot[0] += x[j+6] * y[j+6] ; + dot[1] += x[j+7] * y[j+7] ; + dot[2] += x[j+6] * y[j+7] ; + dot[3] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; + + if ( n1 ) + zdot_kernel_8(n1, x, y , dot ); + + i = n1; + BLASLONG j = i * 2; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[2] += x[ix] * y[iy+1] ; + dot[3] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[2] + dot[3]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[2] - dot[3]; + +#endif + + return(result); + +} + + diff 
--git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c new file mode 100644 index 000000000..30a9552d6 --- /dev/null +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
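zdot.c above is the generic wrapper: it accumulates the four partial sums dot[0..3] and applies the CONJ sign only at the very end, so the same micro-kernels serve both zdotu and zdotc. A small caller-side sketch follows (it assumes the standard CBLAS prototypes from cblas.h, where the result comes back through the final pointer argument; this program is not part of the patch):

        #include <cblas.h>
        #include <complex.h>
        #include <stdio.h>

        int main(void)
        {
                double _Complex x[2] = { 1.0 + 2.0 * I, 3.0 - 1.0 * I };
                double _Complex y[2] = { 0.5 + 0.5 * I, 2.0 + 0.0 * I };
                double _Complex du, dc;

                cblas_zdotu_sub(2, x, 1, y, 1, &du);   /* sum x[i]       * y[i] */
                cblas_zdotc_sub(2, x, 1, y, 1, &dc);   /* sum conj(x[i]) * y[i] */

                printf("zdotu = %g%+gi\n", creal(du), cimag(du));
                printf("zdotc = %g%+gi\n", creal(dc), cimag(dc));
                return 0;
        }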
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 768 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 384(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i 
+ + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + + "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c new file mode 100644 index 000000000..810cb4439 --- /dev/null +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n <=1280 ) + { + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + + "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + } + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "prefetcht0 576(%2,%0,8) \n\t" + "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x + + "prefetcht0 576(%3,%0,8) \n\t" + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + 
"vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + + "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i + "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" + "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + + "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r + "addq $16 , %0 \n\t" + "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r + "subq $8 , %1 \n\t" + "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + + +} + + diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c new file mode 100644 index 000000000..fd06612e6 --- /dev/null +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + +if ( n < 1280 ) +{ + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x + "vaddpd %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" + + + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "addq $16 , %0 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + return; + + } + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y + "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y + + "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y + "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y + + "prefetcht0 
576(%3,%0,8) \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "prefetcht0 576(%2,%0,8) \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" + "vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" + "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" + "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x + "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x + "vaddpd %%ymm4 , %%ymm10, %%ymm4 \n\t" + "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" + + + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" + "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" + "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" + "addq $16 , %0 \n\t" + "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" + "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" + + "jnz 1b \n\t" + + "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" + "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" + "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" + + "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" + "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" + "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" + + "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" + "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + +} + + diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c new file mode 100644 index 000000000..325f74ae3 --- /dev/null +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -0,0 +1,193 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
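The Sandy Bridge kernel above targets AVX without FMA, so each fused multiply-add of the Haswell version becomes a vmulpd into a scratch register followed by a vaddpd into the accumulator, and the second half of the x loads is deferred until between the two product groups to cover latency. The same step as the FMA3 sketch, in plain AVX (again only an illustrative sketch):

#include <immintrin.h>

/* One 4-double (2 complex) step without FMA, Sandy Bridge style. */
static inline void zdot_step_avx(const double *x, const double *y,
                                 __m256d *acc_rr_ii, __m256d *acc_ri_ir)
{
    __m256d vx = _mm256_loadu_pd(x);
    __m256d vy = _mm256_loadu_pd(y);

    *acc_rr_ii = _mm256_add_pd(*acc_rr_ii, _mm256_mul_pd(vx, vy));
    __m256d vy_sw = _mm256_permute_pd(vy, 0x5);           /* vpermilpd $0x5 */
    *acc_ri_ir = _mm256_add_pd(*acc_ri_ir, _mm256_mul_pd(vx, vy_sw));
}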
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + if ( n < 640 ) + { + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + //"prefetcht0 512(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + // "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + return; + } + + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%2,%0,8) \n\t" + "vmovups 
(%2,%0,8), %%xmm8 \n\t" // 1 * x + "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x + + "prefetcht0 512(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y + "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y + + "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x + + "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y + "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i + "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + + "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r + "addq $8 , %0 \n\t" + "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r + "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r + "subq $4 , %1 \n\t" + "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r + + "jnz 1b \n\t" + + "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" + "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" + "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vmovups %%xmm0, (%4) \n\t" + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + + diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c new file mode 100644 index 000000000..a96766032 --- /dev/null +++ b/kernel/x86_64/zscal.c @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2013 - 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if defined(HASWELL) +#include "zscal_microk_haswell-2.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) +#include "zscal_microk_bulldozer-2.c" +#elif defined(STEAMROLLER) +#include "zscal_microk_steamroller-2.c" +#endif + + +#if !defined(HAVE_KERNEL_8) + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) +{ + + BLASLONG i; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + FLOAT t0,t1,t2,t3; + + for( i=0; i 0 ) + { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1 ; + i = n1 * inc_x; + } + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += inc_x ; + j++; + + } + + } + + } + + return(0); + } + + + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + + alpha[0] = da_r; + alpha[1] = da_i; + + if ( da_r == 0.0 ) + if ( da_i == 0 ) + zscal_kernel_8_zero(n1 , alpha , x); + else + zscal_kernel_8_zero_r(n1 , alpha , x); + else + if ( da_i == 0 ) + zscal_kernel_8_zero_i(n1 , alpha , x); + else + zscal_kernel_8(n1 , alpha , x); + + i = n1 << 1; + j = n1; + } + + + if ( da_r == 0.0 ) + { + + if ( da_i == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + x[i+1]=0.0; + i += 2 ; + j++; + + } + + } + else + { + + while(j < n) + { + + temp0 = -da_i * x[i+1]; + x[i+1] = da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + else + { + + if ( da_i == 0.0 ) + { + + while(j < n) + { + + temp0 = da_r * x[i]; + x[i+1] = da_r * x[i+1]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + else + { + + while(j < n) + { + + temp0 = da_r * x[i] - da_i * x[i+1]; + x[i+1] = da_r * x[i+1] + da_i * x[i]; + x[i] = temp0; + i += 2 ; + j++; + + } + + } + + } + + return(0); +} + + diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c new file mode 100644 index 000000000..28fe73480 --- /dev/null +++ b/kernel/x86_64/zscal_microk_bulldozer-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // da_r + "vmovddup 8(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "subq $4 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 192(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups -64(%1), %%xmm4 \n\t" + "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm10, %%xmm10 \n\t" + "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm8 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm9 , %%xmm9 \n\t" + "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm10, %%xmm10 \n\t" 
+ "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm11, %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + "vmovddup 8(%2), %%xmm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "subq $4 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups -64(%1), %%xmm4 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm0 , %%xmm9 \n\t" + "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%xmm12 , %%xmm0 , %%xmm8 \n\t" + "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm13 , %%xmm0 , %%xmm9 \n\t" + "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm14 , %%xmm0 , %%xmm10 \n\t" + "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm15 , %%xmm0 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%xmm4 \n\t" + "vmovups -112(%1), %%xmm5 \n\t" + "vmovups -96(%1), %%xmm6 \n\t" + "vmovups -80(%1), %%xmm7 \n\t" + + + "subq $4 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + 
"vmovups -64(%1), %%xmm4 \n\t" + "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmovups -48(%1), %%xmm5 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmovups -32(%1), %%xmm6 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + "vmovups -16(%1), %%xmm7 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" + "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" + "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" + + "vmovups %%xmm8 , -128(%1) \n\t" + "vmovups %%xmm9 , -112(%1) \n\t" + "vmovups %%xmm10, -96(%1) \n\t" + "vmovups %%xmm11, -80(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%xmm0 , -128(%1) \n\t" + "vmovups %%xmm0 , -112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "addq $64 ,%1 \n\t" + "subq $4 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c new file mode 100644 index 000000000..a93308ec4 --- /dev/null +++ b/kernel/x86_64/zscal_microk_haswell-2.c @@ -0,0 +1,348 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void 
zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, 
%%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c new file mode 100644 index 000000000..d611bf570 --- /dev/null +++ b/kernel/x86_64/zscal_microk_steamroller-2.c @@ -0,0 +1,349 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
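The vector zscal kernels get the complex-multiply sign pattern from vaddsubpd rather than from an explicit negation: for each re/im pair they form t = da_r*(x_r, x_i) and s = da_i*(x_i, x_r) (the second operand swapped with vpermilpd), and vaddsubpd then produces (t0 - s0, t1 + s1) = (da_r*x_r - da_i*x_i, da_r*x_i + da_i*x_r). A rough AVX intrinsics rendering of one 32-byte step, offered as an illustration under the same layout assumptions as the earlier sketches:

#include <immintrin.h>

/* One 4-double (2 complex) step of the zscal update.
   da_rr = broadcast of alpha[0], da_ii = broadcast of alpha[1]. */
static inline void zscal_step_avx(double *x, __m256d da_rr, __m256d da_ii)
{
    __m256d vx  = _mm256_loadu_pd(x);                 /* x_r0 x_i0 x_r1 x_i1 */
    __m256d t   = _mm256_mul_pd(da_rr, vx);           /* da_r*x_r , da_r*x_i */
    __m256d xsw = _mm256_permute_pd(vx, 0x5);         /* x_i0 x_r0 x_i1 x_r1 */
    __m256d s   = _mm256_mul_pd(da_ii, xsw);          /* da_i*x_i , da_i*x_r */

    /* addsub: even elements t - s, odd elements t + s */
    _mm256_storeu_pd(x, _mm256_addsub_pd(t, s));
}

The broadcasts would come from _mm256_broadcast_sd(&alpha[0]) and _mm256_broadcast_sd(&alpha[1]), mirroring the vbroadcastsd/vmovddup instructions at the top of each kernel.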
+*****************************************************************************/ + + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "prefetcht0 320(%1) \n\t" + "prefetcht0 384(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 
\n\t" + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups 0(%1), %%ymm4 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 + "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm0 \n\t" // da_r + + "addq $128, %1 \n\t" + + "vmovups -128(%1), %%ymm4 \n\t" + "vmovups -96(%1), %%ymm5 \n\t" + "vmovups -64(%1), %%ymm6 \n\t" + "vmovups -32(%1), %%ymm7 \n\t" + + + "subq $8 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmovups 0(%1), %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmovups 32(%1), %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmovups 64(%1), %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + "vmovups 96(%1), %%ymm7 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + + "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 + "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" + + "vmovups %%ymm8 , -128(%1) \n\t" + "vmovups %%ymm9 , -96(%1) \n\t" + "vmovups %%ymm10, -64(%1) \n\t" + "vmovups %%ymm11, -32(%1) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT 
*alpha, FLOAT *x) __attribute__ ((noinline)); + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + //"prefetcht0 128(%1) \n\t" + // ".align 2 \n\t" + + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128 ,%1 \n\t" + "subq $8 , %0 \n\t" + "jnz 1b \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n), // 0 + "r" (x), // 1 + "r" (alpha) // 2 + : "cc", "%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 128d59ae4..ef6ae0ec1 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -169,4 +169,4 @@ cchkee.o: cchkee.f zchkee.o: zchkee.f $(FORTRAN) $(DRVOPTS) -c $< -o $@ -.f.o : ; $(FORTRAN) $(OPTS) -c $< -o $@ +.f.o : ; $(FORTRAN) $(DRVOPTS) -c $< -o $@ diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 2352da64c..44b05b794 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -338,4 +338,4 @@ zchkaa.o: zchkaa.f $(FORTRAN) $(DRVOPTS) -c $< -o $@ .f.o: - $(FORTRAN) $(OPTS) -c $< -o $@ + $(FORTRAN) $(DRVOPTS) -c $< -o $@ diff --git a/lapack-netlib/TESTING/sep.in b/lapack-netlib/TESTING/sep.in index 19bd7c3da..6ef4329c9 100644 --- a/lapack-netlib/TESTING/sep.in +++ b/lapack-netlib/TESTING/sep.in @@ -1,11 +1,11 @@ SEP: Data file for testing Symmetric Eigenvalue Problem routines 8 Number of values of N -0 1 2 3 5 19 20 21 Values of N (dimension) +0 1 2 3 5 18 19 21 Values of N (dimension) 5 Number of values of NB 1 3 3 3 10 Values of NB (blocksize) 2 2 2 2 2 Values of NBMIN (minimum blocksize) 1 0 5 9 1 Values of NX (crossover point) -160.0 Threshold value +300.0 Threshold value T Put T to test the LAPACK routines T Put T to test the driver routines T Put T to test the error exits diff --git a/lapack-netlib/TESTING/zctest.in b/lapack-netlib/TESTING/zctest.in index ef88cc0d9..48e88ec50 100644 --- a/lapack-netlib/TESTING/zctest.in +++ b/lapack-netlib/TESTING/zctest.in @@ -1,6 +1,6 @@ Data file for testing ZCGESV/ZCPOSV LAPACK routines 11 Number of values of M -0 1 2 13 17 45 78 91 101 120 132 Values of M (row dimension) +0 1 2 13 17 45 78 91 101 121 132 Values of M (row dimension) 4 Number of values of NRHS 1 2 15 16 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio diff --git a/lapack-netlib/lapacke/src/lapacke_clansy.c b/lapack-netlib/lapacke/src/lapacke_clansy.c index 84a9d965a..eb9951145 100644 --- a/lapack-netlib/lapacke/src/lapacke_clansy.c +++ b/lapack-netlib/lapacke/src/lapacke_clansy.c @@ -51,7 +51,7 @@ float LAPACKE_clansy( int matrix_order, char norm, char uplo, lapack_int n, #endif /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ float LAPACKE_clansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_clansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 
'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/lapacke/src/lapacke_clantr.c b/lapack-netlib/lapacke/src/lapacke_clantr.c index 77743f2d5..00ba34273 100644 --- a/lapack-netlib/lapacke/src/lapacke_clantr.c +++ b/lapack-netlib/lapacke/src/lapacke_clantr.c @@ -53,7 +53,7 @@ float LAPACKE_clantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_clantr_work.c b/lapack-netlib/lapacke/src/lapacke_clantr_work.c index cb253a11e..1fa8cd923 100644 --- a/lapack-netlib/lapacke/src/lapacke_clantr_work.c +++ b/lapack-netlib/lapacke/src/lapacke_clantr_work.c @@ -47,7 +47,7 @@ float LAPACKE_clantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -57,13 +57,13 @@ float LAPACKE_clantr_work( int matrix_order, char norm, char uplo, } /* Allocate memory for temporary array(s) */ a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_ctr_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! */ diff --git a/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c b/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c index 1cd20e1ca..5cf66424d 100644 --- a/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c +++ b/lapack-netlib/lapacke/src/lapacke_cunmlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. + Copyright (c) 2014, Intel Corp. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,7 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_cunmlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau, @@ -41,20 +41,22 @@ lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans, lapack_complex_float* work, lapack_int lwork ) { lapack_int info = 0; - if( matrix_order == LAPACK_COL_MAJOR ) { + lapack_int r; + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_cunmlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? 
m : n; lapack_int lda_t = MAX(1,k); lapack_int ldc_t = MAX(1,m); lapack_complex_float* a_t = NULL; lapack_complex_float* c_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_cunmlq_work", info ); return info; @@ -84,8 +86,8 @@ lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_cge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_cunmlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); diff --git a/lapack-netlib/lapacke/src/lapacke_dlansy.c b/lapack-netlib/lapacke/src/lapacke_dlansy.c index 5e6721ef8..3d9964202 100644 --- a/lapack-netlib/lapacke/src/lapacke_dlansy.c +++ b/lapack-netlib/lapacke/src/lapacke_dlansy.c @@ -51,7 +51,7 @@ double LAPACKE_dlansy( int matrix_order, char norm, char uplo, lapack_int n, #endif /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ double LAPACKE_dlansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_dlansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/lapacke/src/lapacke_dlantr.c b/lapack-netlib/lapacke/src/lapacke_dlantr.c index 522122cb2..2cde1ebad 100644 --- a/lapack-netlib/lapacke/src/lapacke_dlantr.c +++ b/lapack-netlib/lapacke/src/lapacke_dlantr.c @@ -53,7 +53,7 @@ double LAPACKE_dlantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_dlantr_work.c b/lapack-netlib/lapacke/src/lapacke_dlantr_work.c index 0a937bda9..44d638fa5 100644 --- a/lapack-netlib/lapacke/src/lapacke_dlantr_work.c +++ b/lapack-netlib/lapacke/src/lapacke_dlantr_work.c @@ -46,7 +46,7 @@ double LAPACKE_dlantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); double* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -55,13 +55,13 @@ double LAPACKE_dlantr_work( int matrix_order, char norm, char uplo, return info; } /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_dtr_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_dtr_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, 
lda_t ); /* Call LAPACK function and adjust info */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! */ diff --git a/lapack-netlib/lapacke/src/lapacke_dormlq_work.c b/lapack-netlib/lapacke/src/lapacke_dormlq_work.c index 9a7a997fe..99a7c3c71 100644 --- a/lapack-netlib/lapacke/src/lapacke_dormlq_work.c +++ b/lapack-netlib/lapacke/src/lapacke_dormlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. + Copyright (c) 2014, Intel Corp. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,27 +33,29 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_dormlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_dormlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc, double* work, lapack_int lwork ) { lapack_int info = 0; + lapack_int r; lapack_int lda_t, ldc_t; double *a_t = NULL, *c_t = NULL; - if( matrix_order == LAPACK_COL_MAJOR ) { + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dormlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? m : n; lda_t = MAX(1,k); ldc_t = MAX(1,m); /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_dormlq_work", info ); return info; @@ -81,8 +83,8 @@ lapack_int LAPACKE_dormlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_dge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_dormlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); diff --git a/lapack-netlib/lapacke/src/lapacke_slansy.c b/lapack-netlib/lapacke/src/lapacke_slansy.c index 105ce4635..adad99b7d 100644 --- a/lapack-netlib/lapacke/src/lapacke_slansy.c +++ b/lapack-netlib/lapacke/src/lapacke_slansy.c @@ -51,7 +51,7 @@ float LAPACKE_slansy( int matrix_order, char norm, char uplo, lapack_int n, #endif /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ float LAPACKE_slansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_slansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/lapacke/src/lapacke_slantr.c b/lapack-netlib/lapacke/src/lapacke_slantr.c index d6a512027..80313d118 100644 --- a/lapack-netlib/lapacke/src/lapacke_slantr.c +++ b/lapack-netlib/lapacke/src/lapacke_slantr.c @@ -53,7 +53,7 @@ float LAPACKE_slantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for 
working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_slantr_work.c b/lapack-netlib/lapacke/src/lapacke_slantr_work.c index 2389468d0..9032f7094 100644 --- a/lapack-netlib/lapacke/src/lapacke_slantr_work.c +++ b/lapack-netlib/lapacke/src/lapacke_slantr_work.c @@ -46,7 +46,7 @@ float LAPACKE_slantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); float* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -55,13 +55,13 @@ float LAPACKE_slantr_work( int matrix_order, char norm, char uplo, return info; } /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_str_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_str_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! */ diff --git a/lapack-netlib/lapacke/src/lapacke_sormlq_work.c b/lapack-netlib/lapacke/src/lapacke_sormlq_work.c index 7a7464d18..bbf55bd84 100644 --- a/lapack-netlib/lapacke/src/lapacke_sormlq_work.c +++ b/lapack-netlib/lapacke/src/lapacke_sormlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. + Copyright (c) 2014, Intel Corp. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,27 +33,29 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_sormlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_sormlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work, lapack_int lwork ) { lapack_int info = 0; + lapack_int r; lapack_int lda_t, ldc_t; float *a_t = NULL, *c_t = NULL; - if( matrix_order == LAPACK_COL_MAJOR ) { + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_sormlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? 
m : n; lda_t = MAX(1,k); ldc_t = MAX(1,m); /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_sormlq_work", info ); return info; @@ -81,8 +83,8 @@ lapack_int LAPACKE_sormlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_sge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_sge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_sormlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); diff --git a/lapack-netlib/lapacke/src/lapacke_zlansy.c b/lapack-netlib/lapacke/src/lapacke_zlansy.c index 891437846..460a51a85 100644 --- a/lapack-netlib/lapacke/src/lapacke_zlansy.c +++ b/lapack-netlib/lapacke/src/lapacke_zlansy.c @@ -51,7 +51,7 @@ double LAPACKE_zlansy( int matrix_order, char norm, char uplo, lapack_int n, #endif /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; @@ -62,7 +62,7 @@ double LAPACKE_zlansy( int matrix_order, char norm, char uplo, lapack_int n, res = LAPACKE_zlansy_work( matrix_order, norm, uplo, n, a, lda, work ); /* Release memory and exit */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || - LAPACKE_lsame( norm, '0' ) ) { + LAPACKE_lsame( norm, 'O' ) ) { LAPACKE_free( work ); } exit_level_0: diff --git a/lapack-netlib/lapacke/src/lapacke_zlantr.c b/lapack-netlib/lapacke/src/lapacke_zlantr.c index 887bc2eea..001ce68f6 100644 --- a/lapack-netlib/lapacke/src/lapacke_zlantr.c +++ b/lapack-netlib/lapacke/src/lapacke_zlantr.c @@ -53,7 +53,7 @@ double LAPACKE_zlantr( int matrix_order, char norm, char uplo, char diag, /* Allocate memory for working array(s) */ if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, '0' ) ) { - work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/lapacke/src/lapacke_zlantr_work.c b/lapack-netlib/lapacke/src/lapacke_zlantr_work.c index 65e741428..8700a6ee2 100644 --- a/lapack-netlib/lapacke/src/lapacke_zlantr_work.c +++ b/lapack-netlib/lapacke/src/lapacke_zlantr_work.c @@ -47,7 +47,7 @@ double LAPACKE_zlantr_work( int matrix_order, char norm, char uplo, info = info - 1; } } else if( matrix_order == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,n); + lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; /* Check leading dimension(s) */ if( lda < n ) { @@ -57,13 +57,13 @@ double LAPACKE_zlantr_work( int matrix_order, char norm, char uplo, } /* Allocate memory for temporary array(s) */ a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_order, uplo, diag, n, a, lda, a_t, lda_t ); + LAPACKE_ztr_trans( matrix_order, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ res = 
LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); info = 0; /* LAPACK call is ok! */ diff --git a/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c b/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c index 8677ac0bc..38a2d947a 100644 --- a/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c +++ b/lapack-netlib/lapacke/src/lapacke_zunmlq_work.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2011, Intel Corp. + Copyright (c) 2014, Intel Corp. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,7 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans, +lapack_int LAPACKE_zunmlq_work( int matrix_layout, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau, @@ -41,20 +41,22 @@ lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans, lapack_complex_double* work, lapack_int lwork ) { lapack_int info = 0; - if( matrix_order == LAPACK_COL_MAJOR ) { + lapack_int r; + if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zunmlq( &side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } - } else if( matrix_order == LAPACK_ROW_MAJOR ) { + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + r = LAPACKE_lsame( side, 'l' ) ? m : n; lapack_int lda_t = MAX(1,k); lapack_int ldc_t = MAX(1,m); lapack_complex_double* a_t = NULL; lapack_complex_double* c_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < r ) { info = -8; LAPACKE_xerbla( "LAPACKE_zunmlq_work", info ); return info; @@ -84,8 +86,8 @@ lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_order, k, m, a, lda, a_t, lda_t ); - LAPACKE_zge_trans( matrix_order, m, n, c, ldc, c_t, ldc_t ); + LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_zunmlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, work, &lwork, &info ); diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt new file mode 100644 index 000000000..de42e1ab6 --- /dev/null +++ b/lapack/CMakeLists.txt @@ -0,0 +1,98 @@ + +include_directories(${CMAKE_SOURCE_DIR}) + + +set(LAPACK_SOURCES + getrf/getrf_single.c + potrf/potrf_U_single.c + potrf/potrf_L_single.c + lauum/lauum_U_single.c + lauum/lauum_L_single.c +) + +# add a 'z' to filename for complex version +set(LAPACK_MANGLED_SOURCES + getf2/getf2_k.c + lauu2/lauu2_U.c + lauu2/lauu2_L.c + potf2/potf2_U.c + potf2/potf2_L.c +) + +# sources that need TRANS set +# this has a 'z' version +set(TRANS_SOURCES + getrs/getrs_single.c +) + +# sources that need UNIT set +# these do NOT have a z version +set(UNIT_SOURCES + trtri/trtri_U_single.c + trtri/trtri_L_single.c +) + +# these have a 'z' version +set(UNIT_SOURCES2 + trti2/trti2_U.c + trti2/trti2_L.c +) + +GenerateNamedObjects("${LAPACK_SOURCES}") +GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) + +# TODO: laswp needs arch specific code +GenerateNamedObjects("laswp/generic/laswp_k.c" "" "laswp_plus" false "" "" false 3) +GenerateNamedObjects("laswp/generic/laswp_k.c" "MINUS" "laswp_minus" false "" "" false 3) + +if (SMP) + + if (USE_OPENMP) + set(GETRF_SRC 
getrf/getrf_parallel_omp.c) + else () + set(GETRF_SRC getrf/getrf_parallel.c) + endif () + + # these do not have 'z' versions + set(PARALLEL_SOURCES + ${GETRF_SRC} + lauum/lauum_U_parallel.c + lauum/lauum_L_parallel.c + potrf/potrf_U_parallel.c + potrf/potrf_L_parallel.c + ) + + # this has a z version + list(APPEND TRANS_SOURCES + getrs/getrs_parallel.c + ) + + # these do NOT have a z version + list(APPEND UNIT_SOURCES + trtri/trtri_U_parallel.c + trtri/trtri_L_parallel.c + ) + + GenerateNamedObjects("${PARALLEL_SOURCES}") +endif () + +foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + foreach (trans_src ${TRANS_SOURCES}) + string(REGEX MATCH "[a-z]/([a-z]+_)([a-z]+)" op_name ${trans_src}) + string(REPLACE "/" "/z" ztrans_src ${trans_src}) + GenerateNamedObjects("${ztrans_src}" "TRANS=1" "${CMAKE_MATCH_1}N_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + GenerateNamedObjects("${ztrans_src}" "TRANS=2" "${CMAKE_MATCH_1}T_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + GenerateNamedObjects("${ztrans_src}" "TRANS=3" "${CMAKE_MATCH_1}R_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + GenerateNamedObjects("${ztrans_src}" "TRANS=4" "${CMAKE_MATCH_1}C_${CMAKE_MATCH_2}" false "" "" false ${float_type}) + endforeach () + else () + GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4 "" false ${float_type}) + endif () +endforeach () + +GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) +GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) + +add_library(lapack OBJECT ${OPENBLAS_SRC}) + diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index a76be3ba7..8fdf76987 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -67,7 +67,7 @@ double sqrt(double); #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 -static inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { +static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); double n = (double)(N - IS - BK); @@ -373,7 +373,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG num_cpu; +#ifdef _MSC_VER + BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; +#else volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); +#endif #ifndef COMPLEX #ifdef XDOUBLE diff --git a/make.inc b/make.inc index 485cb7d48..1fc95b0c6 100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX -DRVOPTS = $(OPTS) +DRVOPTS = $(NOOPT) ARCHFLAGS= -ru #RANLIB = ranlib diff --git a/openblas_config_template.h b/openblas_config_template.h index 3b3435b0e..942a8f547 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -59,7 +59,8 @@ typedef int blasint; extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) + (__GNUC__ >= 3 && !defined(__cplusplus)) || \ + _MSC_VER >= 1800) // Visual Studio 2013 supports complex #define OPENBLAS_COMPLEX_C99 #ifndef __cplusplus #include diff --git a/param.h b/param.h index 18c711eb3..6c9ca83da 100644 --- a/param.h +++ b/param.h @@ -499,6 +499,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef EXCAVATOR +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + + + +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 +#define GEMV_UNROLL 8 +#endif + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 576 +#define ZGEMM_DEFAULT_P 288 +#define CGEMM_DEFAULT_P 576 +#else +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#endif +#define QGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 160 +#define ZGEMM_DEFAULT_Q 160 +#define CGEMM_DEFAULT_Q 160 +#else +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#endif +#define QGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#define SGEMM_DEFAULT_R 12288 +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R 12288 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + #ifdef ATHLON #define SNUMOPT 4 @@ -1322,7 +1414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 diff --git a/symcopy.h b/symcopy.h index 48ccbd369..16172c046 100644 --- a/symcopy.h +++ b/symcopy.h @@ -43,7 +43,7 @@ #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) -static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -141,7 +141,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -232,7 +232,7 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } -static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -362,7 +362,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -486,7 +486,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -613,7 +613,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -735,7 +735,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } -static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -862,7 +862,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -984,7 +984,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } -static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1082,7 +1082,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1180,7 +1180,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1270,7 +1270,7 @@ 
static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1360,7 +1360,7 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1490,7 +1490,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1620,7 +1620,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; @@ -1744,7 +1744,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ } } -static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ +static __inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 000000000..cd4497117 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,38 @@ +include_directories(${CMAKE_SOURCE_DIR}) + +enable_language(Fortran) + +set(OpenBLAS_Tests + sblat1 sblat2 sblat3 + dblat1 dblat2 dblat3 + cblat1 cblat2 cblat3 + zblat1 zblat2 zblat3) + +foreach(test_bin ${OpenBLAS_Tests}) +add_executable(${test_bin} ${test_bin}.f) +target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}_static) +endforeach() + +# $1 exec, $2 input, $3 output_result +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh +"rm -f $3\n" +"$1 < $2\n" +"grep -q FATAL $3\n" +"if [ $? -eq 0 ]; then\n" +"echo Error\n" +"exit 1\n" +"else\n" +"exit 0\n" +"fi\n" +) + +set(float_types s d c z) +foreach(float_type ${float_types}) +string(TOUPPER ${float_type} float_type_upper) +add_test(NAME "${float_type}blas1" + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat1") +add_test(NAME "${float_type}blas2" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) +add_test(NAME "${float_type}blas3" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) +endforeach() \ No newline at end of file
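
The norm-selector and workspace changes in the ?lansy/?lantr wrappers above are easiest to see from the caller's side. The program below is illustrative only and not part of the patch; it assumes a LAPACKE build that installs lapacke.h. It uses the one-norm selector 'O' with LAPACKE_dlansy (the digit '0' matches no norm, which is why the allocation guard had to change) and a rectangular m-by-n input to LAPACKE_dlantr, the case the enlarged MAX(m,n) work buffer is meant to cover:

    #include <stdio.h>
    #include <lapacke.h>

    int main( void )
    {
        /* 3x3 symmetric matrix, column-major, upper triangle referenced. */
        double s[9] = { 4, 0, 0,
                        1, 3, 0,
                        2, 0, 5 };
        /* 'O' (or '1') selects the one-norm; this is the path whose
         * workspace allocation is enabled by the '0' -> 'O' change. */
        double nrm1 = LAPACKE_dlansy( LAPACK_COL_MAJOR, 'O', 'U', 3, s, 3 );

        /* 2x4 upper-trapezoidal matrix (m < n), column-major, lda = m. */
        double t[8] = { 1, 0,
                        2, 5,
                        3, 6,
                        4, 7 };
        double nrmi = LAPACKE_dlantr( LAPACK_COL_MAJOR, 'I', 'U', 'N',
                                      2, 4, t, 2 );

        printf( "one-norm = %g, infinity-norm = %g\n", nrm1, nrmi );
        return 0;
    }

Linking depends on how the libraries were built; with OpenBLAS it is typically something like cc norms.c -lopenblas.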
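
The ?ormlq/?unmlq wrapper fix is a pure dimension-rule change: the reflector block produced by ?gelqf is k-by-m when Q is applied from the left and k-by-n when it is applied from the right, so the row-major leading-dimension check must compare lda against m or n depending on side. The following standalone sketch mirrors that rule; the stand-in lapack_int typedef, the lsame helper, and check_lda_rowmajor are hypothetical names for illustration, not library code:

    #include <stdio.h>

    typedef int lapack_int;            /* stand-in for the LAPACKE typedef */

    /* Case-insensitive character compare, in the spirit of LAPACKE_lsame. */
    static int lsame( char a, char b )
    {
        if( a >= 'A' && a <= 'Z' ) a += 'a' - 'A';
        if( b >= 'A' && b <= 'Z' ) b += 'a' - 'A';
        return a == b;
    }

    /* Returns 0 if lda is acceptable for a row-major ormlq-style call,
     * or -8 (the argument position LAPACKE reports for lda) otherwise. */
    static lapack_int check_lda_rowmajor( char side, lapack_int m,
                                          lapack_int n, lapack_int lda )
    {
        lapack_int r = lsame( side, 'l' ) ? m : n;
        return ( lda < r ) ? -8 : 0;
    }

    int main( void )
    {
        printf( "side=R, m=4, n=2, lda=2 -> %d\n",
                (int)check_lda_rowmajor( 'R', 4, 2, 2 ) );
        printf( "side=L, m=4, n=2, lda=2 -> %d\n",
                (int)check_lda_rowmajor( 'L', 4, 2, 2 ) );
        return 0;
    }

With side = 'R', m = 4, n = 2 and lda = 2 the helper returns 0, whereas the pre-patch comparison against m would have produced the -8 argument error handled in the wrappers above.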
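
The compiler-portability edits (static inline changed to static __inline, and the _MSC_VER branch around the flag array in getrf_parallel.c) rely on __inline being accepted by both MSVC and GCC in C mode, while __attribute__((aligned(...))) is a GCC extension that MSVC rejects. A commonly used alternative, shown here only as a hedged sketch and not what the patch does, is to hide the difference behind a macro; ALIGNED_128 and flag_example are hypothetical names:

    #include <stdio.h>
    #include <stdint.h>

    #if defined(_MSC_VER)
      #define ALIGNED_128 __declspec(align(128))
    #else
      #define ALIGNED_128 __attribute__((aligned(128)))
    #endif

    /* 128-byte-aligned static array, analogous in spirit to the flag
     * array in getrf_parallel.c (which the patch instead declares
     * without the attribute when building with MSVC). */
    static ALIGNED_128 long flag_example[8];

    int main( void )
    {
        printf( "alignment remainder mod 128: %lu\n",
                (unsigned long)( (uintptr_t)flag_example % 128 ) );
        return 0;
    }
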