diff --git a/.travis.yml b/.travis.yml index 3f917ce72..bde0e202d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -211,44 +211,48 @@ matrix: - &test-macos os: osx - osx_image: xcode10.1 + osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - <<: *test-macos osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update - - brew install gcc@10 # for gfortran + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - - <<: *test-macos - osx_image: xcode10.0 - env: - - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" + # - <<: *test-macos + # osx_image: xcode10 + # env: + # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 53c1709a8..12730e0e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 
12)
+set(OpenBLAS_PATCH_VERSION 13)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 7b994885a..be9a32a7c 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -190,4 +190,7 @@ In chronological order:
   * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support
 
 * Danfeng Zhang
-  * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
\ No newline at end of file
+  * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
+
+* PingTouGe Semiconductor Co., Ltd.
+  * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
diff --git a/Changelog.txt b/Changelog.txt
index edd3563ec..cbc7007ac 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,54 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.13
+ 12-Dec-2020
+
+common:
+ * Added a generic bfloat16 SBGEMV kernel
+ * Fixed a potentially severe memory leak after fork in OpenMP builds
+   that was introduced in 0.3.12
+ * Added detection of the Fujitsu Fortran compiler
+ * Added detection of the (e)gfortran compiler on OpenBSD
+ * Added support for overriding the default name of the library independently
+   from symbol suffixing in the gmake builds (already supported in cmake)
+
+RISCV:
+ * Added a RISC-V port optimized for C910V
+
+POWER:
+ * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N
+ * Improved DGEMM performance on POWER10
+ * Improved STRSM and DTRSM performance on POWER9 and POWER10
+ * Fixed segmentation faults in DYNAMIC_ARCH builds
+ * Fixed compilation with the PGI compiler
+
+x86:
+ * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12
+
+x86_64:
+ * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake
+ * Improved the performance of SASUM and DASUM kernels through parallelization
+ * Improved the performance of SROT and DROT kernels
+ * Improved the performance of multithreaded xSYRK
+ * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran
+   (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or
+   wrong results)
+ * Fixed miscompilations by old gcc 4.6
+ * Fixed misdetection of AVX2 capability in some Sandybridge cpus
+ * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD
+
+ARM64:
+ * Fixed segmentation faults in DYNAMIC_ARCH builds
+
+MIPS:
+ * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA
+ * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV
+ * Added handling of zero increments in the MSA kernels for SSWAP and DSWAP
+ * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only)
+
+SPARC:
+ * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers
+
 ====================================================================
 Version 0.3.12
  24-Oct-2020
diff --git a/Makefile b/Makefile
index a9af62a22..54dd3be41 100644
--- a/Makefile
+++ b/Makefile
@@ -268,7 +268,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
+ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
+	-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
+else
 	-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
+endif
 	-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
diff --git a/Makefile.arm b/Makefile.arm
index fac6b56824..a27b58e84 100644
--- a/Makefile.arm
+++ b/Makefile.arm
@@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6)
 CCOMMON_OPT += -mfpu=vfp
 FCOMMON_OPT += -mfpu=vfp
 endif
+
+ifdef HAVE_NEON
+CCOMMON_OPT += -mfpu=neon
+FCOMMON_OPT += -mfpu=neon
+endif
diff --git a/Makefile.install b/Makefile.install
index 7c1a3ca43..e8b64465f 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
 OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
 OPENBLAS_BINARY_DIR := $(PREFIX)/bin
 OPENBLAS_BUILD_DIR := $(CURDIR)
-OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
+OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE)
 OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
 OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
 OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
@@ -150,13 +150,13 @@ endif
 endif
 
 #Generating openblas.pc
-	@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
-	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
+	@echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
+	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
 
 #Generating OpenBLASConfig.cmake
diff --git a/Makefile.prebuild b/Makefile.prebuild
index 48fb5e991..d6395da7b 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -41,6 +41,10 @@ ifeq ($(TARGET), I6500)
 TARGET_FLAGS = -mips64r6
 endif
 
+ifeq ($(TARGET), C910V)
+TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v
+endif
+
 all: getarch_2nd
 	./getarch_2nd 0 >> $(TARGET_MAKE)
 	./getarch_2nd 1 >> $(TARGET_CONF)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
new file mode 100644
index 000000000..15d7b059c
--- /dev/null
+++ b/Makefile.riscv64
@@ -0,0 +1,4 @@
+ifeq ($(CORE), C910V)
+CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v
+FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static
+endif
diff --git a/Makefile.rule b/Makefile.rule
index a4d11dc7c..e4b82104e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.12
+VERSION = 0.3.13
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.sparc b/Makefile.sparc
index 8895b96dd..61c7aa36d 100644
--- a/Makefile.sparc
+++ b/Makefile.sparc
@@ -3,21 +3,29 @@ RANLIB = ranlib
 
 ifdef BINARY64
+ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT += -mcpu=v9 -m64
+else
+CCOMMON_OPT += -m64
+endif
 
 ifeq ($(COMPILER_F77), g77)
 FCOMMON_OPT += -mcpu=v9 -m64
 endif
 
-ifeq ($(COMPILER_F77), f90)
-FCOMMON_OPT += -xarch=v9
+ifeq ($(COMPILER_F77), f95)
+FCOMMON_OPT += -m64
 endif
 
 else
+ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT += -mcpu=v9
+else
+CCOMMON_OPT += -xarch=v9
+endif
 
 ifeq ($(COMPILER_F77), g77)
 FCOMMON_OPT += -mcpu=v9
 endif
 
-ifeq ($(COMPILER_F77), f90)
+ifeq ($(COMPILER_F77), f95)
 FCOMMON_OPT += -xarch=v8plusb
 endif
 
@@ -37,4 +45,4 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \
 else
 LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \
 	-Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath
-endif
\ No newline at end of file
+endif
diff --git a/Makefile.system b/Makefile.system
index 30d8f4ccf..5adde36d8 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -6,7 +6,7 @@
 INCLUDED = 1
 
 ifndef TOPDIR
-TOPDIR = .
+TOPDIR  = .
 endif
 
 # If ARCH is not set, we use the host system's architecture for getarch compile options.
@@ -93,6 +93,12 @@ endif
 
 ifdef TARGET
 GETARCH_FLAGS := -DFORCE_$(TARGET)
 GETARCH_FLAGS += -DUSER_TARGET
+ifeq ($(TARGET), GENERIC)
+ifeq ($(DYNAMIC_ARCH), 1)
+override NO_EXPRECISION=1
+export NO_EXPRECISION
+endif
+endif
 endif
 
 # Force fallbacks for 32bit
@@ -246,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
 else
+HAVE_NEON=
+HAVE_VFP=
+HAVE_VFPV3=
+HAVE_VFPV4=
+HAVE_MMX=
+HAVE_SSE=
+HAVE_SSE2=
+HAVE_SSE3=
+HAVE_SSSE3=
+HAVE_SSE4_1=
+HAVE_SSE4_2=
+HAVE_SSE4A=
+HAVE_SSE5=
+HAVE_AVX=
+HAVE_AVX2=
+HAVE_FMA3=
 include $(TOPDIR)/Makefile_kernel.conf
 endif
@@ -319,6 +341,7 @@ ifeq ($(GCCVERSIONGTEQ7),1)
 else
 GCCDUMPVERSION_PARAM := -dumpversion
 endif
+GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
 GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
 endif
@@ -602,6 +625,10 @@ DYNAMIC_CORE += EMAG8180
 DYNAMIC_CORE += THUNDERX3T110
 endif
 
+ifeq ($(ARCH), mips64)
+DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
+endif
+
 ifeq ($(ARCH), zarch)
 DYNAMIC_CORE = ZARCH_GENERIC
 
@@ -649,7 +676,7 @@ DYNAMIC_CORE += POWER9
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
-LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35)
+LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
 ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
 CCOMMON_OPT += -DHAVE_P10_SUPPORT
@@ -728,7 +755,10 @@ endif
 endif
 endif
 
-
+ifeq ($(ARCH), riscv64)
+NO_BINARY_MODE = 1
+BINARY_DEFINED = 1
+endif
 
 #
@@ -761,14 +791,9 @@ CCOMMON_OPT += -mabi=32
 BINARY_DEFINED = 1
 endif
 
-ifeq ($(CORE), LOONGSON3A)
-CCOMMON_OPT += -march=mips64
-FCOMMON_OPT += -march=mips64
-endif
-
-ifeq ($(CORE), LOONGSON3B)
-CCOMMON_OPT += -march=mips64
-FCOMMON_OPT += -march=mips64
+ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
+CCOMMON_OPT += -march=loongson3a
+FCOMMON_OPT += -march=loongson3a
 endif
 
 ifeq ($(CORE), MIPS24K)
@@ -810,7 +835,9 @@ endif
 ifndef BINARY_DEFINED
 ifneq ($(OSNAME), AIX)
 ifdef BINARY64
+ifneq ($(ARCH), riscv64)
 CCOMMON_OPT += -m64
+endif
 else
 CCOMMON_OPT += -m32
 endif
@@ -855,7 +882,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
 FCOMMON_OPT += -Mrecursive -Kieee
 ifeq ($(OSNAME), Linux)
 ifeq ($(ARCH), x86_64)
-FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
+FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
 ifeq ($(FLANG_VENDOR),AOCC)
 FCOMMON_OPT += -fno-unroll-loops
 endif
@@ -931,8 +958,10 @@ endif
 else
 ifdef BINARY64
 ifneq ($(OSNAME), AIX)
+ifneq ($(ARCH), riscv64)
 FCOMMON_OPT += -m64
 endif
+endif
 ifdef INTERFACE64
 ifneq ($(INTERFACE64), 0)
 FCOMMON_OPT += -fdefault-integer-8
@@ -1048,11 +1077,11 @@ FCOMMON_OPT += -n32
 else
 FCOMMON_OPT += -n64
 endif
-ifeq ($(CORE), LOONGSON3A)
+ifeq ($(CORE), LOONGSON3R3)
 FCOMMON_OPT += -loongson3 -static
 endif
 
-ifeq ($(CORE), LOONGSON3B)
+ifeq ($(CORE), LOONGSON3R4)
 FCOMMON_OPT += -loongson3 -static
 endif
 
@@ -1078,11 +1107,11 @@ CCOMMON_OPT += -n32
 else
 CCOMMON_OPT += -n64
 endif
-ifeq ($(CORE), LOONGSON3A)
+ifeq ($(CORE), LOONGSON3R3)
 CCOMMON_OPT += -loongson3 -static
 endif
 
-ifeq ($(CORE), LOONGSON3B)
+ifeq ($(CORE), LOONGSON3R4)
 CCOMMON_OPT += -loongson3 -static
 endif
 
@@ -1101,16 +1130,25 @@ CCOMMON_OPT += -w
 ifeq ($(ARCH), x86)
 CCOMMON_OPT += -m32
 else
-FCOMMON_OPT += -m64
+ifdef BINARY64
+CCOMMON_OPT += -m64
+else
+CCOMMON_OPT += -m32
+endif
 endif
 endif
 
 ifeq ($(F_COMPILER), SUN)
 CCOMMON_OPT += -DF_INTERFACE_SUN
+FCOMMON_OPT += -ftrap=%none -xrecursive
 ifeq ($(ARCH), x86)
 FCOMMON_OPT += -m32
 else
+ifdef BINARY64
 FCOMMON_OPT += -m64
+else
+FCOMMON_OPT += -m32
+endif
 endif
 ifeq ($(USE_OPENMP), 1)
 FCOMMON_OPT += -xopenmp=parallel
@@ -1184,10 +1222,8 @@ ifdef SMP
 CCOMMON_OPT += -DSMP_SERVER
 
 ifeq ($(ARCH), mips64)
-ifneq ($(CORE), LOONGSON3B)
 USE_SIMPLE_THREADED_LEVEL3 = 1
 endif
-endif
 
 ifeq ($(USE_OPENMP), 1)
 # USE_SIMPLE_THREADED_LEVEL3 = 1
@@ -1262,10 +1298,14 @@ ifndef SYMBOLSUFFIX
 SYMBOLSUFFIX =
 endif
 
+ifndef LIBSONAMEBASE
+LIBSONAMEBASE = openblas
+endif
+
 ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)
+LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
 else
-LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
 endif
 
 ifeq ($(OSNAME), CYGWIN_NT)
@@ -1279,8 +1319,10 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
 
 include $(TOPDIR)/Makefile.$(ARCH)
 
 ifneq ($(C_COMPILER), PGI)
+ifneq ($(C_COMPILER), SUN)
 CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
 endif
+endif
 CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
 
 ifeq ($(CORE), PPC440)
@@ -1297,11 +1339,9 @@ endif
 
 ifneq ($(ARCH), x86_64)
 ifneq ($(ARCH), x86)
-ifneq ($(CORE), LOONGSON3B)
 NO_AFFINITY = 1
 endif
 endif
-endif
 
 ifdef NO_AFFINITY
 ifeq ($(NO_AFFINITY), 0)
@@ -1515,6 +1555,8 @@ export HAVE_SSE4_2
 export HAVE_SSE4A
 export HAVE_SSE5
 export HAVE_AVX
+export HAVE_AVX2
+export HAVE_FMA3
 export HAVE_VFP
 export HAVE_VFPV3
 export HAVE_VFPV4
@@ -1525,6 +1567,7 @@ export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE
 export NO_AVX512
+export NO_AVX2
 export BUILD_BFLOAT16
 
 export SBGEMM_UNROLL_M
diff --git a/Makefile.x86 b/Makefile.x86
index 330690935..0e27264d8 100644
--- a/Makefile.x86
+++ b/Makefile.x86
@@ -1,5 +1,10 @@
 # COMPILER_PREFIX = mingw32-
 
+ifdef HAVE_SSE
+CCOMMON_OPT += -msse
+FCOMMON_OPT += -msse
+endif
+
 ifeq ($(OSNAME), Interix)
 ARFLAGS = -m x86
 
@@ -54,9 +59,11 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
 else
 LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm
 endif
-
+ifdef HAVE_SSE2
+CCOMMON_OPT += -msse2
+FCOMMON_OPT += -msse2
+endif
 ifdef HAVE_SSE3
-ifndef DYNAMIC_ARCH
 CCOMMON_OPT += -msse3
 FCOMMON_OPT += -msse3
 ifdef HAVE_SSSE3
@@ -68,5 +75,4 @@ CCOMMON_OPT += -msse4.1
 FCOMMON_OPT += -msse4.1
 endif
 endif
-endif
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index a849f0b01..00967bcb6 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -9,9 +9,9 @@ endif
 endif
 
 ifdef HAVE_SSE3
-ifndef DYNAMIC_ARCH
 CCOMMON_OPT += -msse3
 FCOMMON_OPT += -msse3
+endif
 ifdef HAVE_SSSE3
 CCOMMON_OPT += -mssse3
 FCOMMON_OPT += -mssse3
@@ -20,6 +20,22 @@ ifdef HAVE_SSE4_1
 CCOMMON_OPT += -msse4.1
 FCOMMON_OPT += -msse4.1
 endif
+ifndef OLDGCC
+ifdef HAVE_AVX
+CCOMMON_OPT += -mavx
+FCOMMON_OPT += -mavx
+endif
+endif
+ifndef NO_AVX2
+ifdef HAVE_AVX2
+CCOMMON_OPT += -mavx2
+FCOMMON_OPT += -mavx2
+endif
+endif
+ifndef OLDGCC
+ifdef HAVE_FMA3
+CCOMMON_OPT += -mfma
+FCOMMON_OPT += -mfma
 endif
 endif
 
@@ -47,8 +63,6 @@ ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
 # cooperlake support was added in 10.1
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
 CCOMMON_OPT += -march=cooperlake
 FCOMMON_OPT += -march=cooperlake
@@ -68,15 +82,11 @@ endif
 endif
 endif
 
-ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
-ifndef DYNAMIC_ARCH
+ifdef HAVE_AVX2
 ifndef NO_AVX2
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
 ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 CCOMMON_OPT += -mavx2
 endif
@@ -101,7 +111,6 @@ endif
 endif
 endif
 endif
-endif
diff --git a/README.md b/README.md
index ca034e747..267df5358 100644
--- a/README.md
+++ b/README.md
@@ -172,6 +172,13 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **Z13**: Optimized Level-3 BLAS and Level-1,2
 - **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
 
+#### RISC-V
+
+- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
+  ```sh
+  make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
+  ```
+
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
diff --git a/TargetList.txt b/TargetList.txt
index 66eca4506..d19964916 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -104,3 +104,8 @@ VORTEX
 ZARCH_GENERIC
 Z13
 Z14
+
+10.RISC-V 64:
+RISCV64_GENERIC
+C910V
+
diff --git a/benchmark/amax.c b/benchmark/amax.c
index 19ae95c8b..29310dd71 100644
--- a/benchmark/amax.c
+++ b/benchmark/amax.c
@@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef AMAX
 
 #ifdef COMPLEX
 #ifdef DOUBLE
-#define AMAX   BLASFUNC(dzamax)
+#define AMAX BLASFUNC(dzamax)
 #else
-#define AMAX   BLASFUNC(scamax)
+#define AMAX BLASFUNC(scamax)
 #endif
 #else
 #ifdef DOUBLE
-#define AMAX   BLASFUNC(damax)
+#define AMAX BLASFUNC(damax)
 #else
-#define AMAX   BLASFUNC(samax)
+#define AMAX BLASFUNC(samax)
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[])
+{
 
   FLOAT *x;
   blasint m, i;
-  blasint inc_x=1;
+  blasint inc_x = 1;
   int loops = 1;
   int l;
   char *p;
 
+  int from = 1;
+  int to = 200;
+  int step = 1;
 
-  int from =   1;
-  int to   = 200;
-  int step =   1;
+  double time1, timeg;
 
-  struct timeval start, stop;
-  double time1,timeg;
+  argc--;
+  argv++;
 
-  argc--;argv++;
+  if (argc > 0)
+  {
+    from = atol(*argv);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    to = MAX(atol(*argv), from);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    step = atol(*argv);
+    argc--;
+    argv++;
+  }
 
-  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
-  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
-  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}
+  if ((p = getenv("OPENBLAS_LOOPS")))
+    loops = atoi(p);
+  if ((p = getenv("OPENBLAS_INCX")))
+    inc_x = atoi(p);
- if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -152,37 +100,31 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); - fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMIN #ifdef COMPLEX #ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) +#define AMIN BLASFUNC(dzamin) #else -#define AMIN BLASFUNC(scamin) +#define AMIN BLASFUNC(scamin) #endif #else #ifdef DOUBLE -#define AMIN BLASFUNC(damin) +#define AMIN BLASFUNC(damin) #else -#define AMIN BLASFUNC(samin) +#define AMIN BLASFUNC(samin) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; + int from = 1; + int to = 200; + int step = 1; - struct timeval start, stop; - double time1,timeg; + double time1, timeg; - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -151,39 +100,35 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef ASUM #ifdef COMPLEX #ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) +#define ASUM BLASFUNC(dzasum) #else -#define ASUM BLASFUNC(scasum) +#define ASUM BLASFUNC(scasum) #endif #else #ifdef DOUBLE -#define ASUM BLASFUNC(dasum) +#define ASUM BLASFUNC(dasum) #else -#define ASUM BLASFUNC(sasum) +#define ASUM BLASFUNC(sasum) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; FLOAT result; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; - -#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) - struct timeval start, stop; - double time1,timeg; -#else - struct timespec start = { 0, 0 }, stop = { 0, 0 }; + int from = 1; + int to = 200; + int step = 1; double time1, timeg; -#endif - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : 
%3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; } + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } #ifdef __linux srandom(getpid()); @@ -158,45 +100,33 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); - for (l=0; l1) - timeg /= loops; + if (loops > 1) + timeg /= loops; #ifdef COMPLEX fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif - } return 0; diff --git a/benchmark/axpby.c b/benchmark/axpby.c index 793ee7e40..d02d9a889 100644 --- a/benchmark/axpby.c +++ b/benchmark/axpby.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPBY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -176,16 +104,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -127,8 +56,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timespec start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - clock_gettime( CLOCK_REALTIME, &start); + begin(); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); - clock_gettime( 
CLOCK_REALTIME, &stop);
+	end();
 
-	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/bench.h b/benchmark/bench.h
new file mode 100644
index 000000000..1f9b8986c
--- /dev/null
+++ b/benchmark/bench.h
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+  FILETIME ft;
+  unsigned __int64 tmpres = 0;
+  static int tzflag;
+
+  if (NULL != tv)
+  {
+    GetSystemTimeAsFileTime(&ft);
+
+    tmpres |= ft.dwHighDateTime;
+    tmpres <<= 32;
+    tmpres |= ft.dwLowDateTime;
+
+    /*converting file time to unix epoch*/
+    tmpres /= 10;  /*convert into microseconds*/
+    tmpres -= DELTA_EPOCH_IN_MICROSECS;
+    tv->tv_sec = (long)(tmpres / 1000000UL);
+    tv->tv_usec = (long)(tmpres % 1000000UL);
+  }
+
+  return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+  int shmid;
+  void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+  if ((shmid =shmget(IPC_PRIVATE,
+                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+    printf( "Memory allocation failed(shmget).\n");
+    exit(1);
+  }
+
+  address = shmat(shmid, NULL, SHM_RND);
+
+  if ((BLASLONG)address == -1){
+    printf( "Memory allocation failed(shmat).\n");
+    exit(1);
+  }
+
+  shmctl(shmid, IPC_RMID, 0);
+
+  return address;
+}
+
+
+#define malloc huge_malloc
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  struct timeval start, stop;
+#else
+  struct timespec start = { 0, 0 }, stop = { 0, 0 };
+#endif
+
+double getsec()
+{
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+#else
+  return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+#endif
+}
+
+void begin() {
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  gettimeofday( &start, (struct timezone *)0);
+#else
+  clock_gettime(CLOCK_REALTIME, &start);
+#endif
+}
+
+void end() {
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  gettimeofday( &stop, (struct timezone *)0);
+#else
+  clock_gettime(CLOCK_REALTIME, &stop);
+#endif
+}
\ No newline at end of file
diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c
index 5908b6085..65b20d039 100644
--- a/benchmark/cholesky.c
+++ b/benchmark/cholesky.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -71,41 +66,6 @@ double fabs(double); #endif #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - - static __inline double getmflops(int ratio, int m, double secs){ double mm = (double)m; @@ -145,7 +105,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1; argc--;argv++; @@ -220,20 +179,19 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - maxerr = 0.; if (!(uplos & 1)) { for (j = 0; j < m; j++) { diff --git a/benchmark/copy.c b/benchmark/copy.c index eb5148fff..c5e447521 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef COPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,11 +57,9 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1 = 0.0, timeg = 0.0; long nanos = 0; time_t seconds = 0; - struct timespec time_start = { 0, 0 }, time_end = { 0, 0 }; argc--;argv++; @@ -176,15 +103,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(ddot) #else #define DOT BLASFUNC(sdot) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -122,7 +49,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -169,15 +95,12 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + 
begin(); result = DOT (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + end(); + timeg += getsec(); } diff --git a/benchmark/geev.c b/benchmark/geev.c index 4fd2c8d6f..6e22cdfb6 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -36,13 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEEV @@ -74,71 +68,6 @@ extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; @@ -154,7 +83,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -223,7 +151,7 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; #ifndef COMPLEX @@ -239,14 +167,14 @@ int main(int argc, char *argv[]){ GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 8cd14bbed..35f5096f3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -55,71 +49,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ IFLOAT *a, *b; @@ -139,7 +68,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1, timeg; argc--;argv++; @@ -228,14 +156,14 @@ int main(int argc, char *argv[]){ ldc = m; fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k); - gettimeofday( &start, (struct timezone *)0); + begin(); for (j=0; j -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -133,7 +62,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -187,16 +115,12 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; - + end(); + timeg += getsec(); } timeg /= loops; diff --git a/benchmark/gemv.c b/benchmark/gemv.c index fb1f541d3..a0001277a 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GEMV @@ -52,72 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -211,10 +139,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } @@ -248,10 +176,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } diff --git a/benchmark/ger.c b/benchmark/ger.c index d53d328f0..7ce08c3ad 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GER @@ -49,72 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -131,7 +59,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -198,16 +125,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -66,71 +61,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -142,7 +72,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -194,22 +123,18 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GESV (&m, &m, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); - - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - + end(); + time1 = getsec(); 
diff --git a/benchmark/gesv.c b/benchmark/gesv.c
--- a/benchmark/gesv.c
+++ b/benchmark/gesv.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"

 double fabs(double);

@@ -66,71 +61,6 @@ double fabs(double);
 #endif
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *a, *b;
@@ -142,7 +72,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1;

   argc--;argv++;
@@ -194,22 +123,18 @@ int main(int argc, char *argv[]){
     }
   }

-  gettimeofday( &start, (struct timezone *)0);
+  begin();

   GESV (&m, &m, a, &m, ipiv, b, &m, &info);

-  gettimeofday( &stop, (struct timezone *)0);
-
-
-  time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
+  end();
+  time1 = getsec();

   fprintf(stderr, "%10.2f MFlops %10.6f s\n",
 	  COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);
-
 }

 return 0;
diff --git a/benchmark/getri.c b/benchmark/getri.c
index a07014768..98a860906 100644
--- a/benchmark/getri.c
+++ b/benchmark/getri.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"

 #undef GETRF
 #undef GETRI

@@ -72,71 +67,6 @@ extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv,
 		  FLOAT *work, blasint *lwork, blasint *info);

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *a,*work;
@@ -148,7 +78,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1;

   argc--;argv++;
@@ -205,21 +134,21 @@ int main(int argc, char *argv[]){
     exit(1);
   }

-  gettimeofday( &start, (struct timezone *)0);
+  begin();

   lwork = -1;
   GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);

   lwork = (blasint)wkopt[0];
   GETRI(&m, a, &m, ipiv, work, &lwork, &info);

-  gettimeofday( &stop, (struct timezone *)0);
+  end();

   if (info) {
     fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
     exit(1);
   }

-  time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+  time1 = getsec();

   fprintf(stderr,
 	  " %10.2f MFlops : %10.2f Sec : %d\n",
diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c
index 60ba9fb89..35249bdf9 100644
--- a/benchmark/hbmv.c
+++ b/benchmark/hbmv.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HBMV - #ifdef DOUBLE #define HBMV BLASFUNC(zhbmv) #else #define HBMV BLASFUNC(chbmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -125,7 +52,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -186,15 +112,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + timeg += getsec(); } diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 2bc165458..a0a9985ad 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMM @@ -41,72 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define HEMM BLASFUNC(chemm) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -126,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 98618a04e..ad130ddd0 100644 --- a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMV - #ifdef DOUBLE #define HEMV BLASFUNC(zhemv) #else #define HEMV BLASFUNC(chemv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +108,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/her.c b/benchmark/her.c index 010f8120d..cd1fb7f48 100644 --- a/benchmark/her.c +++ b/benchmark/her.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER - #ifdef DOUBLE #define HER BLASFUNC(zher) #else #define HER BLASFUNC(cher) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -126,8 +53,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timeval start, stop; double time1; argc--;argv++; @@ -166,15 +91,13 @@ int main(int argc, char *argv[]){ x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HER (&uplo, &m, alpha, x, &incx, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2.c b/benchmark/her2.c index 0f80f3ed9..d87bfd466 100644 --- a/benchmark/her2.c +++ b/benchmark/her2.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2 - #ifdef DOUBLE #define HER2 BLASFUNC(zher2) #else #define HER2 BLASFUNC(cher2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,16 +95,13 @@ int main(int argc, char *argv[]){ y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); - + begin(); HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 021873beb..d3cdce696 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2K #ifdef DOUBLE @@ -40,72 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define HER2K BLASFUNC(cher2k) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,13 +96,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/herk.c b/benchmark/herk.c index c09d35c1f..628dc2c11 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HERK - #ifdef DOUBLE #define HERK BLASFUNC(zherk) #else #define HERK BLASFUNC(cherk) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -167,18 +93,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); - } return 0; diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index b0157094e..907e2adc4 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HPMV - #ifdef DOUBLE #define HPMV BLASFUNC(zhpmv) #else #define HPMV BLASFUNC(chpmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -183,13 +109,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamax.c b/benchmark/iamax.c index c87044ab4..15618cbcc 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMAX @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamin.c b/benchmark/iamin.c index e7c8e59e4..a57638ecc 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMIN @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imax.c b/benchmark/imax.c index b56ef64ba..b96b17167 100644 --- a/benchmark/imax.c +++ b/benchmark/imax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imin.c b/benchmark/imin.c index 4a92c8bd0..095eacca9 100644 --- a/benchmark/imin.c +++ b/benchmark/imin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 661a44175..202035245 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -72,71 +67,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1, time2; argc--;argv++; @@ -198,31 +127,31 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GETRF (&m, &m, a, &m, ipiv, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - gettimeofday( &start, (struct timezone *)0); + begin(); GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time2 = getsec(); maxerr = 0.; diff --git a/benchmark/max.c b/benchmark/max.c index a19a386a2..301b943a5 100644 --- a/benchmark/max.c +++ b/benchmark/max.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/min.c b/benchmark/min.c index 4df8fb0fd..39df37a29 100644 --- a/benchmark/min.c +++ b/benchmark/min.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c index 0f416621a..cd64d564a 100644 --- a/benchmark/nrm2.c +++ b/benchmark/nrm2.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NRM2 @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NRM2 (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/potrf.c b/benchmark/potrf.c index cb4c23bab..116d0cca5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -86,37 +81,7 @@ double fabs(double); // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif int main(int argc, char *argv[]){ @@ -141,7 +106,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -217,18 +181,18 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrf info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; if ( btest == 'S' ) @@ -240,17 +204,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; } @@ -258,18 +222,18 @@ int main(int argc, char *argv[]){ if ( btest == 'I' ) { - gettimeofday( &start, (struct timezone *)0); + begin(); POTRI(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potri info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; } diff --git a/benchmark/rot.c b/benchmark/rot.c index 69698988d..15b630e36 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROT @@ -52,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *x, *y;
@@ -133,7 +63,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1,timeg;

   argc--;argv++;
@@ -179,13 +108,13 @@ int main(int argc, char *argv[]){
 		for (l=0; l<loops; l++){

-			gettimeofday( &start, (struct timezone *)0);
+			begin();

 			ROT (&m, x, &inc_x, y, &inc_y, c, s);

-			gettimeofday( &stop, (struct timezone *)0);
-			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+			end();
+			time1 = getsec();

 			timeg += time1;
 		}
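The huge_malloc shims deleted throughout this patch were already dead code (their guard ends in && 0), but the technique they used is worth noting: a SysV shared-memory segment backed by huge pages. A standalone, Linux-only sketch distilled from the deleted block, with HUGE_PAGESIZE assumed to be 2 MiB here:

/* Hugepage allocation as in the deleted huge_malloc: get a SHM_HUGETLB
   segment, attach it, then mark the id for removal so the memory
   vanishes when the process detaches or exits. Requires huge pages to
   be reserved beforehand (sysctl vm.nr_hugepages). */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

#define HUGE_PAGESIZE (2UL * 1024 * 1024)  /* assumed 2 MiB huge pages */

static void *huge_malloc(size_t size) {
    /* round the request up to a whole number of huge pages */
    size_t rounded = (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1);
    int shmid = shmget(IPC_PRIVATE, rounded,
                       SHM_HUGETLB | IPC_CREAT | 0600);
    if (shmid < 0) {
        perror("shmget");
        exit(1);
    }
    void *address = shmat(shmid, NULL, SHM_RND);
    if (address == (void *)-1) {
        perror("shmat");
        exit(1);
    }
    /* segment id removed now; the mapping lives until shmdt()/exit */
    shmctl(shmid, IPC_RMID, NULL);
    return address;
}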
diff --git a/benchmark/rotm.c b/benchmark/rotm.c
--- a/benchmark/rotm.c
+++ b/benchmark/rotm.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"

 #undef ROTM

@@ -40,72 +35,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ROTM BLASFUNC(srotm)
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz)
-{
-
-	FILETIME ft;
-	unsigned __int64 tmpres = 0;
-	static int tzflag;
-
-	if (NULL != tv) {
-		GetSystemTimeAsFileTime(&ft);
-
-		tmpres |= ft.dwHighDateTime;
-		tmpres <<= 32;
-		tmpres |= ft.dwLowDateTime;
-
-		/*converting file time to unix epoch*/
-		tmpres /= 10;	/*convert into microseconds*/
-		tmpres -= DELTA_EPOCH_IN_MICROSECS;
-		tv->tv_sec = (long)(tmpres / 1000000UL);
-		tv->tv_usec = (long)(tmpres % 1000000UL);
-	}
-
-	return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
-	int shmid;
-	void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-	if ((shmid =
-	     shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		    SHM_HUGETLB | IPC_CREAT | 0600)) < 0) {
-		printf("Memory allocation failed(shmget).\n");
-		exit(1);
-	}
-
-	address = shmat(shmid, NULL, SHM_RND);
-
-	if ((BLASLONG)address == -1) {
-		printf("Memory allocation failed(shmat).\n");
-		exit(1);
-	}
-
-	shmctl(shmid, IPC_RMID, 0);
-
-	return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[])
 {

 	FLOAT *x, *y;
@@ -122,7 +51,7 @@ int main(int argc, char *argv[])
 	int to = 200;
 	int step = 1;

-	struct timeval start, stop;
+
 	double time1, timeg;

 	argc--;
@@ -188,14 +117,13 @@ int main(int argc, char *argv[])
 	}

 	for (l = 0; l < loops; l++) {
-		gettimeofday(&start, (struct timezone *)0);
+		begin();

 		ROTM(&m, x, &inc_x, y, &inc_y, param);

-		gettimeofday(&stop, (struct timezone *)0);
+		end();

-		time1 = (double)(stop.tv_sec - start.tv_sec) +
-			(double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+		time1 = getsec();

 		timeg += time1;
 	}
diff --git a/benchmark/scal.c b/benchmark/scal.c
index 8bd62c77c..8de6cfd04 100644
--- a/benchmark/scal.c
+++ b/benchmark/scal.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"

 #undef SCAL

@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *x, *y;
@@ -128,7 +57,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1,timeg;

   argc--;argv++;
@@ -174,13 +102,13 @@ int main(int argc, char *argv[]){
 		for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 		}
-		gettimeofday( &start, (struct timezone *)0);
+		begin();

 		SCAL (&m, alpha, x, &inc_x);

-		gettimeofday( &stop, (struct timezone *)0);
+		end();

-		time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+		time1 = getsec();

 		timeg += time1;
diff --git a/benchmark/spmv.c b/benchmark/spmv.c
index cff504d3b..e4dcbf4ae 100644
--- a/benchmark/spmv.c
+++ b/benchmark/spmv.c
@@ -25,17 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"

 #undef SPMV
-
 #ifndef COMPLEX

 #ifdef DOUBLE
@@ -54,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -135,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -193,13 +120,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spr.c b/benchmark/spr.c index 5dcaa4f8b..2fc9994f8 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPR @@ -41,73 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SPR BLASFUNC(sspr) #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*c; @@ -129,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -173,13 +99,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR (&uplo, &m, alpha, c, &inc_x, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/spr2.c b/benchmark/spr2.c index a5f2791f7..8f194e83a 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SPR2 @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*b,*c; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +110,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/swap.c b/benchmark/swap.c index 76d545995..64ebe5e9b 100644 --- a/benchmark/swap.c +++ b/benchmark/swap.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SWAP @@ -49,71 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +104,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SWAP (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/symm.c b/benchmark/symm.c index bb9849eb5..1c6d91d00 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
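
/* The Windows gettimeofday() shim deleted throughout these files converted
   FILETIME (100 ns ticks since 1601-01-01) to the Unix epoch, and
   DELTA_EPOCH_IN_MICROSECS is simply the 1601 -> 1970 offset expressed in
   microseconds: 134774 days * 86400 s/day * 1e6 us/s. A standalone,
   compile-time sanity check of that constant (C11): */

_Static_assert(134774ULL * 86400ULL * 1000000ULL == 11644473600000000ULL,
               "1601-01-01 -> 1970-01-01 offset in microseconds");
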
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/symv.c b/benchmark/symv.c index e4c892b5a..0a35aaef0 100644 --- a/benchmark/symv.c +++ b/benchmark/symv.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMV @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -134,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -192,13 +120,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/syr.c b/benchmark/syr.c index a9dd293e6..ebbf2bd3c 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x,*a; @@ -124,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -165,13 +93,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2.c b/benchmark/syr2.c index 9efbca315..acbc86987 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYR2 @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYR2 BLASFUNC(ssyr2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y, *a; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -174,13 +101,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index a906559eb..3895c2861 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR2K @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +67,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +110,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 0fbb943f6..82606a21a 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYRK @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -177,13 +105,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c index fe9d07534..41f2e0fb8 100644 --- a/benchmark/tpmv.c +++ b/benchmark/tpmv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
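
/* How the " %10.2f MFlops" lines printed by the hunks above arrive at a
   figure: an operation count for the routine divided by the time reported
   by getsec(). The per-routine counts sit outside the diff context shown,
   so the syrk count quoted below (m*m*(m+1) flops for real C := alpha*A*A' +
   beta*C) is a conventional illustration, not taken from this patch: */

static double mflops(double ops, double sec) {
  return (ops / sec) * 1.e-6;  /* operations per second, in millions */
}
/* e.g. fprintf(stderr, " %10.2f MFlops\n", mflops((double)m*m*(m+1), time1)); */
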
#endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c index 8472ac261..ebfa29692 100644 --- a/benchmark/tpsv.c +++ b/benchmark/tpsv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPSV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 23af122b4..3ab9fc255 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -141,7 +71,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -180,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", diff --git a/benchmark/trmv.c b/benchmark/trmv.c index 46641b3e4..0e8088b54 100644 --- a/benchmark/trmv.c +++ b/benchmark/trmv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trsm.c b/benchmark/trsm.c index 17676946a..d2ebd7f54 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRSM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -196,13 +125,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 1734e2adb..66ac3a3c7 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -25,14 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include -#include "common.h" - +#include "bench.h" #undef GEMV #undef TRSV @@ -55,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
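
/* One semantic point about the trsv.c hunks below: trsv previously timed
   with clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...), i.e. CPU time consumed
   by the process, while most other benchmarks measured wall-clock time via
   gettimeofday(). Moving everything to begin()/end() settles all files on
   whatever single clock bench.h uses, which is not visible in this patch.
   A standalone illustration of the difference being unified away: */

#include <stdio.h>
#include <time.h>

int main(void) {
  struct timespec wall, cpu;
  clock_gettime(CLOCK_REALTIME, &wall);          /* wall-clock time */
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu); /* CPU time of this process */
  printf("wall=%lld.%09ld cpu=%lld.%09ld\n",
         (long long)wall.tv_sec, wall.tv_nsec,
         (long long)cpu.tv_sec, cpu.tv_nsec);
  return 0;
}
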
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -133,7 +61,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timespec time_start, time_end; time_t seconds = 0; double time1,timeg; @@ -189,19 +116,13 @@ int main(int argc, char *argv[]){ for(l =0;l< loops;l++){ - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); - + begin(); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); - - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); - nanos = time_end.tv_nsec - time_start.tv_nsec; - seconds = time_end.tv_sec - time_start.tv_sec; - - time1 = seconds + nanos /1.e9; + end(); + time1 = getsec(); timeg += time1; } - timeg /= loops; long long muls = n*(n+1)/2.0; long long adds = (n - 1.0)*n/2.0; diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c index ba1515365..06cdde13a 100644 --- a/benchmark/zdot-intel.c +++ b/benchmark/zdot-intel.c @@ -25,90 +25,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#define RETURN_BY_STACK 1 -#include "common.h" +#include "bench.h" +#define RETURN_BY_STACK 1 #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(zdotu) #else #define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -123,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); DOT (&result, &m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index fa624e859..23b3efcad 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
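
/* zdot is the one routine here with two call forms (see the #ifdef
   RETURN_BY_STACK in the hunk below, and the RETURN_BY_STACK 1 define kept
   in zdot-intel.c above): a complex dot product can be returned by value or
   through a hidden result pointer, depending on the Fortran ABI the library
   targets. A standalone illustration of the two conventions; all names and
   types here are illustrative, not from the patch: */

#include <complex.h>
#include <stdio.h>

/* by-value convention, like "result = DOT(&m, x, &inc_x, y, &inc_y)" */
static float complex dot_by_value(int n, const float complex *x,
                                  const float complex *y) {
  float complex s = 0.0f;
  for (int i = 0; i < n; i++) s += x[i] * y[i];  /* unconjugated, as in cdotu */
  return s;
}

/* hidden-pointer convention, like "DOT(&result, &m, x, &inc_x, y, &inc_y)" */
static void dot_by_stack(float complex *result, int n,
                         const float complex *x, const float complex *y) {
  *result = dot_by_value(n, x, y);
}

int main(void) {
  float complex x[2] = { 1.0f + 2.0f * I, 3.0f * I };
  float complex y[2] = { 2.0f, 1.0f - 1.0f * I };
  float complex r;
  dot_by_stack(&r, 2, x, y);
  printf("%g%+gi\n", crealf(r), cimagf(r));  /* expect 5+7i */
  return 0;
}
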
 #define DOT BLASFUNC(cdotu)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -122,7 +50,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -169,15 +96,15 @@ int main(int argc, char *argv[]){
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
-     gettimeofday( &start, (struct timezone *)0);
+     begin();
 
 #ifdef RETURN_BY_STACK
      DOT (&result , &m, x, &inc_x, y, &inc_y );
 #else
      result = DOT (&m, x, &inc_x, y, &inc_y );
 #endif
-     gettimeofday( &stop, (struct timezone *)0);
+     end();
 
-     time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+     time1 = getsec();
 
      timeg += time1;
 
diff --git a/c_check b/c_check
index 5ea93b75c..970d475d7 100644
--- a/c_check
+++ b/c_check
@@ -6,7 +6,8 @@
 # Checking cross compile
 $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
 $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
-$hostarch = `uname -p` if ($hostos eq "AIX");
+$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
+chop($hostarch);
 $hostarch = "x86_64" if ($hostarch eq "amd64");
 $hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/);
 $hostarch = "arm64" if ($hostarch eq "aarch64");
@@ -92,6 +93,7 @@
 $architecture = ia64 if ($data =~ /ARCH_IA64/);
 $architecture = arm if ($data =~ /ARCH_ARM/);
 $architecture = arm64 if ($data =~ /ARCH_ARM64/);
 $architecture = zarch if ($data =~ /ARCH_ZARCH/);
+$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
 
 $defined = 0;
 
@@ -136,6 +138,11 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) {
     $binary =32;
 }
 
+if ($architecture eq "riscv64") {
+    $defined = 1;
+    $binary = 64;
+}
+
 if ($compiler eq "PGI") {
     $compiler_name .= " -tp p7" if ($binary eq "32");
     $compiler_name .= " -tp p7-64" if ($binary eq "64");
@@ -192,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
 } else {
     $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
     $code = '"addvi.b $w0, $w1, 1"';
-    $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
+    $msa_flags = "-mmsa -mfp64 -mload-store-pairs";
     print $tmpf "#include <msa.h>\n\n";
     print $tmpf "void main(void){ __asm__
volatile($code); }\n"; @@ -270,6 +277,15 @@ if ($data =~ /HAVE_C11/) { } } +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { + $no_avx2 = 0; + $oldgcc = 0; + $data = `$compiler_name -dumpversion`; + if ($data <= 4.6) { + $no_avx2 = 1; + $oldgcc = 1; + } +} $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; @@ -362,6 +378,8 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; +print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; +print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; diff --git a/cblas.h b/cblas.h index bf310bed2..da00d46d6 100644 --- a/cblas.h +++ b/cblas.h @@ -393,6 +393,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 2f4d1c6d7..76952152b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") endif () endif () -if (${CORE} STREQUAL "SKYLAKEX") +if (${CORE} STREQUAL SKYLAKEX) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") @@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () -if (${CORE} STREQUAL "COOPERLAKE") +if (${CORE} STREQUAL COOPERLAKE) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) @@ -124,6 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () + if (HAVE_FMA3) + set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 7d7f5ffda..0c102bae5 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -184,8 +184,8 @@ macro(SetDefaultL2) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../arm/gemv_n.c) - set(SBGEMVTKERNEL ../arm/gemv_t.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) set(SHGERKERNEL ../generic/ger.c) endif () endmacro () diff --git a/cmake/os.cmake b/cmake/os.cmake index c644bc3f7..e24059dd5 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,6 +84,14 @@ if (X86) set(NO_EXPRECISION 1) endif () +if (DYNAMIC_ARCH) +if (TARGET) +if (${TARGET} STREQUAL "GENERIC") + set(NO_EXPRECISION 1) +endif () +endif () +endif () + if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") set(SANITY_CHECK 1) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 
3e38abbf5..da7686c33 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -556,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING) MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") endif () endif () + unset (HAVE_AVX2) + unset (HAVE_AVX) + unset (HAVE_FMA3) + unset (HAVE_MMX) + unset (HAVE_SSE) + unset (HAVE_SSE2) + unset (HAVE_SSE3) + unset (HAVE_SSSE3) + unset (HAVE_SSE4A) + unset (HAVE_SSE4_1) + unset (HAVE_SSE4_2) + unset (HAVE_NEON) + unset (HAVE_VFP) + unset (HAVE_VFPV3) + unset (HAVE_VFPV4) message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way diff --git a/cmake/system.cmake b/cmake/system.cmake index 4cc46236d..66e95c6d3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -44,50 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () -if (DEFINED TARGET) - if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") - else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() - endif() - if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() - endif() - if (DEFINED HAVE_SSE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") - endif() - if (DEFINED HAVE_SSE2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") - endif() - if (DEFINED HAVE_SSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") - endif() - if (DEFINED HAVE_SSE4_1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") - endif() -endif() if (DEFINED TARGET) + message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --") message(STATUS "Targeting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () @@ -187,6 +146,63 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") +if (DEFINED TARGET) + if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") +# endif() + endif() + if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + if (${TARGET} STREQUAL 
HASWELL AND NOT NO_AVX2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_AVX) + if (NOT NO_AVX) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") + endif() + endif() + if (DEFINED HAVE_AVX2) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_FMA3) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + endif() + endif() + if (DEFINED HAVE_SSE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") + endif() + if (DEFINED HAVE_SSE2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") + endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (DEFINED HAVE_SSSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") + endif() + if (DEFINED HAVE_SSE4_1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") + endif() +endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") endif () diff --git a/common.h b/common.h index a3ef99b59..2825407cb 100644 --- a/common.h +++ b/common.h @@ -437,6 +437,11 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif + +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_arm64.h b/common_arm64.h index 314946282..9cdded305 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -142,14 +142,8 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(CORTEXA57) -#define BUFFER_SIZE (20 << 20) -#elif defined(TSV110) || defined(EMAG8180) #define BUFFER_SIZE (32 << 20) #else -#define BUFFER_SIZE (16 << 20) -#endif -#else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif diff --git a/common_interface.h b/common_interface.h index 032877fe1..b9ebb2772 100644 --- a/common_interface.h +++ b/common_interface.h @@ -250,6 +250,8 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *, + bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, diff --git a/common_level2.h b/common_level2.h index 640d4a073..9a5ebb4d9 100644 --- a/common_level2.h +++ b/common_level2.h @@ -44,6 +44,10 @@ extern "C" { #endif +int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); +int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, 
float *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. -#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_macro.h b/common_macro.h index 54deed57c..c6ea1bfd9 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,10 +646,12 @@ #elif defined(BFLOAT16) -#define D_TO_BF16_K SBDTOBF16_K -#define D_BF16_TO_K DBF16TOD_K -#define S_TO_BF16_K SBSTOBF16_K -#define S_BF16_TO_K SBF16TOS_K +#define D_TO_BF16_K SBDTOBF16_K +#define D_BF16_TO_K DBF16TOD_K +#define S_TO_BF16_K SBSTOBF16_K +#define S_BF16_TO_K SBF16TOS_K +#define SBGEMV_N SBGEMV_N_K +#define SBGEMV_T SBGEMV_T_K #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/common_param.h b/common_param.h index b50e4ff80..3e3ae06f8 100644 --- a/common_param.h +++ b/common_param.h @@ -78,8 +78,8 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); + int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); diff --git a/common_power.h b/common_power.h index a61e4e28a..a49197fd7 100644 --- a/common_power.h +++ b/common_power.h @@ -849,6 +849,10 @@ Lmcount$lazy_ptr: #else #define BUFFER_SIZE 
( 16 << 20) #endif +#ifdef DYNAMIC_ARCH +#undef BUFFER_SIZE +#define BUFFER_SIZE (64 << 22) +#endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) diff --git a/common_riscv64.h b/common_riscv64.h new file mode 100644 index 000000000..27f385dfd --- /dev/null +++ b/common_riscv64.h @@ -0,0 +1,98 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_RISCV64 +#define COMMON_RISCV64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#if defined(C910V) +#include +#endif + +#endif diff --git a/common_sb.h b/common_sb.h index 66968ab00..9976e812e 100644 --- a/common_sb.h +++ b/common_sb.h @@ -8,6 +8,8 @@ #define SBDTOBF16_K sbdtobf16_k #define SBF16TOS_K sbf16tos_k #define DBF16TOD_K dbf16tod_k +#define SBGEMV_N_K sbgemv_n +#define SBGEMV_T_K sbgemv_t #define SBGEMM_ONCOPY sbgemm_oncopy #define SBGEMM_OTCOPY sbgemm_otcopy @@ -29,6 +31,8 @@ #define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBF16TOS_K gotoblas -> sbf16tos_k #define DBF16TOD_K gotoblas -> dbf16tod_k +#define SBGEMV_N_K gotoblas -> sbgemv_n +#define SBGEMV_T_K gotoblas -> sbgemv_t #define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy #define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy diff --git a/common_sparc.h b/common_sparc.h index 85e29fffa..90a24ebf1 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -78,6 +78,12 @@ static __inline unsigned long rpcc(void){ #define __BIG_ENDIAN__ #endif +#ifdef C_SUN +#ifndef __64BIT +#define RETURN_BY_STACK +#endif +#endif + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") #else diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..674b65908 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -140,14 +105,16 @@ int detect(void){ } fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; } #endif return CPU_UNKNOWN; + } } char *get_corename(void){ @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -237,10 +204,10 @@ void get_cpuconfig(void){ } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else 
if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c new file mode 100644 index 000000000..0eb50e001 --- /dev/null +++ b/cpuid_riscv64.c @@ -0,0 +1,113 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT   */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,      */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES        */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE       */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR            */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF      */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT       */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT      */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
+/* POSSIBILITY OF SUCH DAMAGE.                                     */
+/*                                                                 */
+/* The views and conclusions contained in the software and         */
+/* documentation are those of the authors and should not be        */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.               */
+/*********************************************************************/
+
+#define CPU_UNKNOWN 0
+#define CPU_C910V   1
+
+static char *cpuname[] = {
+  "UNKNOWN",
+  "C910V"
+};
+
+int detect(void){
+  return CPU_UNKNOWN;
+}
+
+char *get_corename(void){
+  return cpuname[detect()];
+}
+
+void get_architecture(void){
+  printf("RISCV64");
+}
+
+void get_subarchitecture(void){
+}
+
+void get_subdirname(void){
+  printf("riscv64");
+}
+
+void get_cpuconfig(void){
+  printf("#define UNKNOWN\n");
+  printf("#define L1_DATA_SIZE 65536\n");
+  printf("#define L1_DATA_LINESIZE 32\n");
+  printf("#define L2_SIZE 512488\n");
+  printf("#define L2_LINESIZE 32\n");
+  printf("#define DTB_DEFAULT_ENTRIES 64\n");
+  printf("#define DTB_SIZE 4096\n");
+  printf("#define L2_ASSOCIATIVE 4\n");
+}
+
+void get_libname(void){
+  printf("riscv64\n");
+}
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 728d459d1..84c12ff43 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -202,7 +202,7 @@ int support_avx(){
   if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
     xgetbv(0, &eax, &edx);
     if((eax & 6) == 6){
-      ret=1;  //OS support AVX
+      ret=1;  //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2))
     }
   }
   return ret;
@@ -219,8 +219,8 @@ int support_avx2(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 0)
-    ret=1;  //OS supports AVX2
+  if((ebx & (1<<5)) != 0)
+    ret=1;  //CPU supports AVX2
   return ret;
 #else
   return 0;
@@ -235,14 +235,14 @@ int support_avx512(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & 32) != 32){
-    ret=0;  //OS does not even support AVX2
+  if((ebx & (1<<5)) == 0){
+    ret=0;  //cpu does not have avx2 flag
   }
-  if((ebx & (1<<31)) != 0){
+  if((ebx & (1<<31)) != 0){ //AVX512VL flag
     xgetbv(0, &eax, &edx);
     if((eax & 0xe0) == 0xe0)
-      ret=1;  //OS supports AVX512VL
-  }
+      ret=1;  //OS supports saving zmm registers
+  }
   return ret;
 #else
   return 0;
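The support_avx512() fix above separates two distinct questions: whether the CPU advertises the feature (CPUID leaf 7, EBX bit 5 = AVX2, bit 31 = AVX512VL) and whether the OS has enabled saving the wide register state (XGETBV on XCR0: 0x06 covers XMM/YMM, 0xe0 covers opmask and the ZMM halves). A minimal standalone sketch of the same three-step probe, not OpenBLAS code, assuming a GCC/Clang toolchain on x86-64 where <cpuid.h> and _xgetbv (needs -mxsave) are available:

/* Illustrative only -- mirrors the corrected checks in cpuid_x86.c. */
#include <cpuid.h>
#include <immintrin.h>

static int os_and_cpu_support_avx512vl(void) {
    unsigned int eax, ebx, ecx, edx;

    /* CPUID leaf 1: OSXSAVE (bit 27) and AVX (bit 28) must both be set. */
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 0;
    if (!(ecx & (1u << 27)) || !(ecx & (1u << 28))) return 0;

    /* CPUID leaf 7: bit 5 = AVX2, bit 31 = AVX512VL (CPU feature flags). */
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return 0;
    if (!(ebx & (1u << 5)) || !(ebx & (1u << 31))) return 0;

    /* XGETBV(0): has the OS enabled saving of the wide register state?
       0x06 = XMM+YMM, 0xe0 = opmask + ZMM0-15 upper halves + ZMM16-31. */
    unsigned long long xcr0 = _xgetbv(0);
    return ((xcr0 & 0x06) == 0x06) && ((xcr0 & 0xe0) == 0xe0);
}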
diff --git a/ctest.c b/ctest.c
index cd84ab1bb..d674a8cbd 100644
--- a/ctest.c
+++ b/ctest.c
@@ -153,6 +153,11 @@ ARCH_ARM
 ARCH_ARM64
 #endif
 
+#if defined(__riscv)
+ARCH_RISCV64
+#endif
+
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
 HAVE_C11
 #endif
+
diff --git a/ctest/Makefile b/ctest/Makefile
index cba904f75..2a893cae8 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -61,7 +61,7 @@ endif
 all1: $(all1targets)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OMP_NUM_THREADS=2 ./xscblat1
@@ -106,7 +106,7 @@ endif
 all2: $(all2targets)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OMP_NUM_THREADS=2 ./xscblat2 < sin2
@@ -152,7 +152,7 @@ endif
 all3: $(all3targets)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OMP_NUM_THREADS=2 ./xscblat3 < sin3
diff --git a/driver/level2/Makefile b/driver/level2/Makefile
index 7212d6662..caecf4f97 100644
--- a/driver/level2/Makefile
+++ b/driver/level2/Makefile
@@ -413,7 +413,13 @@ XBLASOBJS += \
 	xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \
 	xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \
 	xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \
-	xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \
+	xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX)
+
+ifeq ($(BUILD_BFLOAT16),1)
+SBBLASOBJS += \
+	sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \
+	sbgemv_thread_t$(TSUFFIX).$(SUFFIX)
+endif
 
 endif
 
@@ -3693,4 +3699,12 @@ xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
 xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
 	$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
 
+ifeq ($(BUILD_BFLOAT16),1)
+sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
+sbgemv_thread_t.$(SUFFIX) sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
+endif
+
+
 include ../../Makefile.tail
diff --git a/driver/level2/sbgemv_thread.c b/driver/level2/sbgemv_thread.c
new file mode 100644
index 000000000..534c60f95
--- /dev/null
+++ b/driver/level2/sbgemv_thread.c
@@ -0,0 +1,149 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
+/* POSSIBILITY OF SUCH DAMAGE.                                       */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
+#ifndef TRANSA
+#define SBGEMV SBGEMV_N
+#else
+#define SBGEMV SBGEMV_T
+#endif
+
+static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){
+
+  bfloat16 *a, *x;
+  float    *y;
+  BLASLONG lda, incx, incy;
+  BLASLONG m_from, m_to, n_from, n_to;
+
+  a = (bfloat16 *)args->a;
+  x = (bfloat16 *)args->b;
+  y = (float *)args->c;
+
+  lda  = args->lda;
+  incx = args->ldb;
+  incy = args->ldc;
+
+#ifndef TRANSA        // N
+  m_from = *(range_m + 0);
+  m_to   = *(range_m + 1);
+  n_from = 0;
+  n_to   = args -> n;
+  a += m_from;
+  y += m_from * incy;
+#else                 // T
+  m_from = 0;
+  m_to   = args->m;
+  n_from = *(range_n + 0);
+  n_to   = *(range_n + 1);
+  a += n_from * lda;
+  y += n_from * incy;
+#endif
+
+  SBGEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy);
+
+  return 0;
+}
+
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads)
+{
+  blas_arg_t args;
+  blas_queue_t queue[MAX_CPU_NUMBER];
+  BLASLONG range[MAX_CPU_NUMBER + 1];
+
+#ifndef TRANSA
+  BLASLONG width_for_split = m;
+#else
+  BLASLONG width_for_split = n;
+#endif
+
+  BLASLONG BLOCK_WIDTH = width_for_split/threads;
+
+  int mode = BLAS_BFLOAT16 | BLAS_REAL;
+
+  args.m = m;
+  args.n = n;
+  args.a = (void *)a;
+  args.b = (void *)x;
+  args.c = (void *)y;
+  args.lda = lda;
+  args.ldb = incx;
+  args.ldc = incy;
+  args.alpha = (void *)&alpha;
+  args.beta  = (void *)&beta;
+
+  range[0] = 0;
+
+  int thread_idx;
+
+  for (thread_idx=0; thread_idx<threads; thread_idx++) {
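The rest of this loop builds the per-thread dispatch in the usual OpenBLAS exec_blas style: split width_for_split (rows for the N case, columns for the T case) into BLOCK_WIDTH-sized ranges, point each queue entry at sbgemv_kernel, and run the queue. A condensed sketch of that pattern, using the names declared in CNAME above; this is an illustration of the technique, not the verbatim upstream loop:

/* Sketch only: standard OpenBLAS queue setup for a split level-2 driver. */
for (thread_idx = 0; thread_idx < threads; thread_idx++) {
    /* The last thread absorbs the remainder so the ranges cover everything. */
    range[thread_idx + 1] = (thread_idx == threads - 1)
                          ? range[thread_idx] + width_for_split
                          : range[thread_idx] + BLOCK_WIDTH;
    width_for_split -= BLOCK_WIDTH;

    queue[thread_idx].mode    = mode;
    queue[thread_idx].routine = sbgemv_kernel;
    queue[thread_idx].args    = &args;
    /* sbgemv_kernel reads range_m in the N case and range_n in the T case. */
    queue[thread_idx].range_m = &range[thread_idx];
    queue[thread_idx].range_n = &range[thread_idx];
    queue[thread_idx].sa      = NULL;
    queue[thread_idx].sb      = NULL;
    queue[thread_idx].next    = &queue[thread_idx + 1];
}
queue[threads - 1].next = NULL;   /* terminate the queue list */
exec_blas(threads, queue);        /* run the kernels on the worker threads */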
diff --git a/driver/level3/level3.c b/driver/level3/level3.c
 	    if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
 	    else
-	    if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
+/*
+	    if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
 	    else
+*/
 	    if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 6e1fd9e99..2b33c9589 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 #else
 	    if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
 	    else
+/*
 	    if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
 	    else
+*/
 	    if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
 	  /* Copy part of local region of B into workspace */
diff --git a/driver/others/Makefile b/driver/others/Makefile
index 7558ec058..4a421ef31 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -7,7 +7,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
 
 ifdef SMP
 COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
-ifndef NO_AFFINITY
+ifneq ($(NO_AFFINITY), 1)
 COMMONOBJS += init.$(SUFFIX)
 endif
 endif
@@ -24,19 +24,23 @@ else
 ifeq ($(ARCH),zarch)
 COMMONOBJS += dynamic_zarch.$(SUFFIX)
 else
+ifeq ($(ARCH),mips64)
+COMMONOBJS += dynamic_mips64.$(SUFFIX)
+else
 COMMONOBJS += dynamic.$(SUFFIX)
 endif
 endif
 endif
+endif
 else
 COMMONOBJS += parameter.$(SUFFIX)
 endif
 
-ifdef EXPRECISION
+ifeq ($(EXPRECISION), 1)
 COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX)
 endif
 
-ifdef QUAD_PRECISION
+ifeq ($(QUAD_PRECISION), 1)
 COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX)
 endif
 
@@ -46,11 +50,9 @@ ifeq ($(C_COMPILER), PGI)
 endif
 endif
 
-ifdef USE_CUDA
 ifeq ($(USE_CUDA), 1)
 COMMONOBJS += cuda_init.$(SUFFIX)
 endif
-endif
 
 ifdef FUNCTION_PROFILE
 COMMONOBJS += profile.$(SUFFIX)
@@ -94,10 +96,14 @@ else
 ifeq ($(ARCH),zarch)
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
 else
+ifeq ($(ARCH),mips64)
+HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX)
+else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
 endif
 endif
 endif
+endif
 else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
 endif
diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c
index 04acbcc5f..06039c952 100644
--- a/driver/others/blas_l1_thread.c
+++ b/driver/others/blas_l1_thread.c
@@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
     break;
   }
 
-  mode |= BLAS_LEGACY;
+  if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY;
 
   for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 30e0cc6c2..5e0943c2e 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) {
   blas_cpu_number  = num_threads;
 
 #if defined(ARCH_MIPS64)
+#ifndef DYNAMIC_ARCH
   //set parameters for different number of threads.
   blas_set_parameter();
 #endif
+#endif
 }
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index d546553c1..a576127aa 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -76,10 +76,28 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 #endif
 
-void goto_set_num_threads(int num_threads) {
+static void adjust_thread_buffers() {
 
   int i=0, j=0;
 
+  //adjust buffer for each thread
+  for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
+    for(j=0; j < blas_cpu_number; j++){
+      if(blas_thread_buffer[i][j] == NULL){
+        blas_thread_buffer[i][j] = blas_memory_alloc(2);
+      }
+    }
+    for(; j < MAX_CPU_NUMBER; j++){
+      if(blas_thread_buffer[i][j] != NULL){
+        blas_memory_free(blas_thread_buffer[i][j]);
+        blas_thread_buffer[i][j] = NULL;
+      }
+    }
+  }
+}
+
+void goto_set_num_threads(int num_threads) {
+
   if (num_threads < 1) num_threads = blas_num_threads;
 
   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@@ -92,20 +110,7 @@ void goto_set_num_threads(int num_threads) {
 
   omp_set_num_threads(blas_cpu_number);
 
-  //adjust buffer for each thread
-  for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
-    for(j=0; j<blas_cpu_number; j++){
-      if(blas_thread_buffer[i][j]==NULL){
-        blas_thread_buffer[i][j]=blas_memory_alloc(2);
-      }
-    }
-    for(; j<MAX_CPU_NUMBER; j++){
-      if(blas_thread_buffer[i][j]!=NULL){
-        blas_memory_free(blas_thread_buffer[i][j]);
-        blas_thread_buffer[i][j]=NULL;
-      }
-    }
-  }
+  adjust_thread_buffers();
 }
 
        queue->sb=sb;
    }
  }
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 4624085d5..42f289441 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 
   routine = queue -> routine;
 
-  if (!(queue -> mode & BLAS_LEGACY)) {
+  if (queue -> mode & BLAS_LEGACY) {
+    legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
+  } else
+    if (queue -> mode & BLAS_PTHREAD) {
+      void (*pthreadcompat)(void *) = queue -> routine;
+      (pthreadcompat)(queue -> args);
+    } else
     (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0);
-  } else {
-    legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
-  }
 
   if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 21d2c7948..58f4d8b59 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -330,8 +330,8 @@ int support_avx2(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 0)
-    ret=1;  //OS supports AVX2
+  if((ebx & (1<<5)) != 0)
+    ret=1;  //AVX2 flag is set
   return ret;
 #else
   return 0;
@@ -346,13 +346,13 @@ int support_avx512(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) == 0){
-    ret=0;  //OS does not even support AVX2
+  if((ebx & (1<<5)) == 0){
+    ret=0;  //cpu does not have avx2 flag
   }
-  if((ebx & (1u<<31)) != 0){
+  if((ebx & (1<<31)) != 0){ //AVX512VL flag is set
     xgetbv(0, &eax, &edx);
     if((eax & 0xe0) == 0xe0)
-      ret=1;  //OS supports AVX512VL
+      ret=1;  //OS supports saving zmm register
   }
   return ret;
 #else
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index be22b247c..4f1b12f27 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -139,19 +139,30 @@ static gotoblas_t *force_coretype(char *coretype) {
 static gotoblas_t *get_coretype(void) {
 
   int implementer, variant, part, arch, revision, midr_el1;
+  char coremsg[128];
+
+#if (!defined OS_LINUX && !defined OS_ANDROID)
+  return NULL;
+#else
 
-#if (defined OS_LINUX || defined OS_ANDROID)
   if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
-    char coremsg[128];
+#ifdef __linux
+    FILE *infile;
+    char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
+    p = (char *) NULL ;
+    infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
+    if (!infile) return NULL;
+    fgets(buffer, sizeof(buffer), infile);
+    midr_el1=strtoul(buffer,NULL,16);
+    fclose(infile);
+#else
     snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
     openblas_warning(1, coremsg);
     return NULL;
-  }
-#else
-  return NULL;
 #endif
-
-  get_cpu_ftr(MIDR_EL1, midr_el1);
+  } else {
+    get_cpu_ftr(MIDR_EL1, midr_el1);
+  }
 
   /*
    * MIDR_EL1
   *
@@ -219,8 +230,12 @@ static gotoblas_t *get_coretype(void) {
         return &gotoblas_FALKOR;
     }
     break;
+  default:
+    snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
+    openblas_warning(1, coremsg);
   }
   return NULL;
+#endif
 }
 
 void gotoblas_dynamic_init(void) {
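The MIDR_EL1 value read above (either from the sysfs file or via get_cpu_ftr) is decoded into the implementer and part fields that drive the core-type switch. For reference, a small standalone sketch of that decoding, following the field layout in the Arm Architecture Reference Manual; the example values in the comment are assumptions for illustration:

/* Illustrative only: MIDR_EL1 layout per the Arm ARM.
   implementer = bits [31:24], variant = [23:20], architecture = [19:16],
   part number = [15:4], revision = [3:0]. */
#include <stdint.h>

static inline unsigned midr_implementer(uint32_t midr) { return (midr >> 24) & 0xff;  }
static inline unsigned midr_part(uint32_t midr)        { return (midr >> 4)  & 0xfff; }

/* e.g. implementer 0x41 ('A') is Arm Ltd., and part 0xd08 is a Cortex-A72. */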
diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c
new file mode 100644
index 000000000..9fd19d739
--- /dev/null
+++ b/driver/others/dynamic_mips64.c
@@ -0,0 +1,230 @@
+/*****************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include "common.h"
+
+extern gotoblas_t gotoblas_LOONGSON3R3;
+extern gotoblas_t gotoblas_LOONGSON3R4;
+
+extern void openblas_warning(int verbose, const char * msg);
+
+#define NUM_CORETYPES 2
+
+static char *corename[] = {
+  "loongson3r3",
+  "loongson3r4",
+  "UNKNOWN"
+};
+
+char *gotoblas_corename(void) {
+  if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0];
+  if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1];
+  return corename[NUM_CORETYPES];
+}
+
+static gotoblas_t *force_coretype(char *coretype) {
+  int i;
+  int found = -1;
+  char message[128];
+
+  for ( i=0 ; i < NUM_CORETYPES; i++)
+  {
+    if (!strncasecmp(coretype, corename[i], 20))
+    {
+      found = i;
+      break;
+    }
+  }
+
+  switch (found)
+  {
+    case 0: return (&gotoblas_LOONGSON3R3);
+    case 1: return (&gotoblas_LOONGSON3R4);
+  }
+  snprintf(message, 128, "Core not found: %s\n", coretype);
+  openblas_warning(1, message);
+  return NULL;
+}
+
+#define MMI_MASK 0x00000010
+#define MSA_MASK 0x00000020
+
+int fd[2];
+int support_cpucfg;
+
+static void handler(int signum)
+{
+  close(fd[1]);
+  exit(1);
+}
+
+/* Brief : Function to check if cpucfg supported on loongson
+ * Return: 1   supported
+ *         0   not supported
+ */
+static int cpucfg_test(void) {
+  pid_t pid;
+  int status = 0;
+
+  support_cpucfg = 0;
+  pipe(fd);
+  pid = fork();
+  if (pid == 0) { /* Subprocess */
+    struct sigaction act;
+    close(fd[0]);
+    /* Set signal action for SIGILL. */
+    act.sa_handler = handler;
+    sigaction(SIGILL,&act,NULL);
+
+    /* Execute cpucfg in subprocess.
*/ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..a2f56d839 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -52,6 +52,11 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 6) + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; +#endif return NULL; } diff --git a/driver/others/memory.c b/driver/others/memory.c index ba2bb55b9..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1767,11 +1767,11 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..36da13369 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -717,7 +717,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +731,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } -#endif -#endif - } #endif diff --git a/exports/gensymbol b/exports/gensymbol index 22e470da5..857a17a9e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/f_check b/f_check index f894aa9ac..42241ae10 100644 --- a/f_check +++ b/f_check @@ -33,7 +33,7 @@ if ($compiler eq "") { "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", "pgf95", "pgf90", "pgf77", - "flang", + "flang", "egfortran", "ifort"); OUTER: @@ -69,7 +69,12 @@ if ($compiler eq "") { $bu = "_"; } - if ($data =~ /GNU/ || $data =~ /GCC/ ) { + if ($data =~ /Fujitsu/) { + + $vendor = FUJITSU; + $openmp = "-Kopenmp"; + + } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; @@ -325,6 +330,9 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . 
" " ; } + if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + $flags = "-lomp"; + } if ( ($flags =~ /^\-l/) @@ -337,8 +345,8 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) - && ($flags !~ /[0-9]+/) + && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/)) + && ($flags !~ /[0-9]+/ || ($vendor == FUJITSU && $flags =~ /^-lfj90/)) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . " "; diff --git a/getarch.c b/getarch.c index 3f1448305..29671736e 100644 --- a/getarch.c +++ b/getarch.c @@ -97,9 +97,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__x86_64__) || defined(_M_X64) #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) #else +#ifndef NO_AVX512 #define NO_AVX512 #endif #endif +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -138,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -324,16 +326,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif +#endif #ifdef FORCE_SKYLAKEX #ifdef NO_AVX512 @@ -346,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -359,7 +372,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif @@ -376,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -389,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" #define LIBNAME "cooperlake" #define CORENAME "COOPERLAKE" #endif @@ -549,6 +562,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ @@ -559,10 +582,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" + "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "zen" #define CORENAME "ZEN" #endif +#endif #ifdef FORCE_SSE_GENERIC @@ -790,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" #else #endif @@ -981,6 +1005,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1266,6 +1304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1320,6 +1373,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED @@ -1405,8 +1462,41 @@ int main(int argc, char *argv[]){ printf("NUM_CORES=%d\n", get_num_cores()); -#if defined(__arm__) && !defined(FORCE) +#if defined(__arm__) +#if !defined(FORCE) + fprintf(stderr,"get features!\n"); get_features(); +#else + fprintf(stderr,"split archconfig!\n"); + sprintf(buffer, "%s", ARCHCONFIG); + + p = &buffer[0]; + + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + if (*p != 'H') { + while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; } + if (*p == '-') continue; + } + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf("="); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + if ((*p == ' ') || (*p =='\0')) printf("=1\n"); + } + } + } else p ++; + } +#endif #endif diff --git a/interface/Makefile b/interface/Makefile index 6b247b49f..597956fdb 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -19,7 +19,7 @@ ifeq ($(ARCH), MIPS) SUPPORT_GEMM3M = 1 endif -ifndef NO_FBLAS +ifneq ($(NO_FBLAS), 1) SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ @@ -48,6 +48,7 @@ SBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS2OBJS = sbgemv.$(SUFFIX) SBBLAS3OBJS = sbgemm.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -145,7 +146,7 @@ ZBLAS3OBJS += zgemm3m.$(SUFFIX) endif -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ @@ -284,6 +285,7 @@ CSBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -382,6 +384,7 @@ SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS2OBJS += $(CSBBLAS2OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) @@ -399,7 +402,7 @@ CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) +SBBLASOBJS = 
$(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -507,12 +510,12 @@ ifneq ($(BUILD_COMPLEX16),1) endif FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -$(info FUNCOBJS = {[$(FUNCOBJS)]} ) -ifdef EXPRECISION + +ifeq ($(EXPRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif -ifdef QUAD_PRECISION +ifeq ($(QUAD_PRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif @@ -538,7 +541,7 @@ clean :: level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) +level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) @@ -929,6 +932,11 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -c $< -o $(@F) +endif + ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< @@ -1656,6 +1664,11 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< diff --git a/interface/gemv.c b/interface/gemv.c index c9d52cd69..d5d739fb1 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -191,7 +191,6 @@ void CNAME(enum CBLAS_ORDER order, } #endif - //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; diff --git a/interface/sbgemv.c b/interface/sbgemv.c new file mode 100644 index 000000000..89debe82d --- /dev/null +++ b/interface/sbgemv.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "l1param.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#define ERROR_NAME "SBGEMV " + +#ifdef SMP +static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY) +{ + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + float alpha = *ALPHA; + float beta = *BETA; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') {i = 0;} + if (trans == 'T') {i = 1;} + if (trans == 'R') {i = 0;} + if (trans == 'C') {i = 1;} + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (i < 0) {info = 1;} + + trans = i; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy) +{ + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { // Column Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 0; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 1; + } + } else { // Row Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 1; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 0; + } + + t = n; + n = m; + m = t; + } + + info = -1; + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (trans < 0) {info = 1;} + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || 
(n==0)) return; + + if (trans) { + lenx = m; + leny = n; + } else { + lenx = n; + leny = m; + } + + if (alpha == ZERO) { + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); + return; + } + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) {x -= (lenx - 1) * incx;} + if (incy < 0) {y -= (leny - 1) * incy;} + +#ifdef SMP + int thread_thres_row = 20480; + if (trans) { + if (n <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } else { + if (m <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } + + + if (nthreads == 1) { +#endif + (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy); +#ifdef SMP + } else { + (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads); + } +#endif + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + IDEBUG_END; + + return; +} diff --git a/kernel/Makefile b/kernel/Makefile index e52781c6d..4e86546b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,18 +5,6 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system -ifdef HAVE_SSE3 -CFLAGS += -msse3 -endif -ifdef HAVE_SSSE3 -CFLAGS += -mssse3 -endif - -ifeq ($(C_COMPILER), GCC) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -26,20 +14,14 @@ endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif endif ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) - GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) - GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) - GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) - GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) + GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif @@ -49,12 +31,6 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 -endif - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) - override CFLAGS += -msse -msse2 -endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) @@ -82,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override 
CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -92,6 +70,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 79399c342..888a9b959 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -48,6 +48,16 @@ ifndef XGEMVTKERNEL XGEMVTKERNEL = zgemv_t.S endif +ifeq ($(BUILD_BFLOAT16),1) +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = ../x86_64/sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = ../x86_64/sbgemv_t.c +endif +endif + ### GER ### ifndef SGERKERNEL @@ -234,6 +244,12 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_t$(TSUFFIX).$(SUFFIX) +endif + ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -483,4 +499,10 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2ba593c2e..d8d739965 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -25,7 +25,7 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) +ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index 63584b95c..a486a1868 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -42,24 +42,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) n *= inc_x; if (inc_x == 1) { -#if V_SIMD +#if V_SIMD && (!defined(DOUBLE) || (defined(DOUBLE) && V_SIMD_F64 && V_SIMD > 128)) #ifdef DOUBLE const int vstep = v_nlanes_f64; - const int unrollx2 = n & (-vstep * 2); + const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; v_f64 vsum0 = v_zero_f64(); v_f64 vsum1 = v_zero_f64(); - while (i < unrollx2) - { - vsum0 = v_add_f64(vsum0, v_loadu_f64(x)); - vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep)); - i += vstep * 2; - } - vsum0 = v_add_f64(vsum0, vsum1); - while (i < unrollx) + v_f64 vsum2 = v_zero_f64(); + v_f64 vsum3 = v_zero_f64(); + for (; i < unrollx4; i += vstep * 4) + { + vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); + vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep)); + vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2)); + vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3)); + } + vsum0 = v_add_f64( + v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3)); + for (; i < unrollx; i += vstep) { vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); - i += vstep; } sumf = v_sum_f64(vsum0); #else @@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_f32 vsum1 = 
v_zero_f32(); v_f32 vsum2 = v_zero_f32(); v_f32 vsum3 = v_zero_f32(); - while (i < unrollx4) + for (; i < unrollx4; i += vstep * 4) { - vsum0 = v_add_f32(vsum0, v_loadu_f32(x)); - vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep)); - vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2)); - vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3)); - i += vstep * 4; + vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); + vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep)); + vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2)); + vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3)); } vsum0 = v_add_f32( v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); - while (i < unrollx) + for (; i < unrollx; i += vstep) { vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); - i += vstep; } sumf = v_sum_f32(vsum0); #endif diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index ba0e57eb5..9249b54f8 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__POWER__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else diff --git a/kernel/generic/trmmkernel_16x4.c b/kernel/generic/trmmkernel_16x4.c new file mode 100644 index 000000000..7ea4e108c --- /dev/null +++ b/kernel/generic/trmmkernel_16x4.c @@ -0,0 +1,2092 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) 
__msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dswap_msa.c b/kernel/mips/dswap_msa.c index 7b1f02477..67e97f710 100644 --- a/kernel/mips/dswap_msa.c +++ b/kernel/mips/dswap_msa.c @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) 
__msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 
= LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = 
(v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ 
-490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) 
ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/sswap_msa.c b/kernel/mips/sswap_msa.c index 46fa8aa87..d412285b0 100644 --- a/kernel/mips/sswap_msa.c +++ b/kernel/mips/sswap_msa.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c index 669c25758..97a80b4ba 100644 --- a/kernel/mips/zgemv_n_msa.c +++ b/kernel/mips/zgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index e6febb577..6492f90be 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
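/*
 * A scalar model of the sign bookkeeping done by the OP0..OP2 macros
 * below: the transposed complex GEMV accumulates products in which
 * either factor may be conjugated (CONJ for one operand, XCONJ for
 * the other), and each of the four combinations flips a different
 * subset of signs.  Sketch only -- the exact macro-to-sign wiring is
 * in the hunk that follows.
 */
#include <complex.h>

static double complex cmla_model(double complex acc,
                                 double complex a, double complex x,
                                 int conj_a,  /* CONJ defined  */
                                 int conj_x)  /* XCONJ defined */
{
    if (conj_a) a = conj(a);
    if (conj_x) x = conj(x);
    return acc + a * x;
}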
#undef OP3 #undef OP4 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define ZGEMV_T_8x1() \ diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = 
../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 86df7e3a2..d61f5194a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power10.c -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = dgemm_ncopy_4_power8.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = dgemm_ncopy_8_power10.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -141,13 +141,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = 
casum.c ZASUMKERNEL = zasum.c # -SAXPYKERNEL = saxpy.c +SAXPYKERNEL = saxpy_power10.c DAXPYKERNEL = daxpy_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power9.S -else -CAXPYKERNEL = caxpy.c -endif +CAXPYKERNEL = caxpy_power10.c ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy_power10.c @@ -155,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy_power10.c ZCOPYKERNEL = zcopy_power10.c # -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -DSDOTKERNEL = sdot.c +SDOTKERNEL = sdot_power10.c +DDOTKERNEL = ddot_power10.c +DSDOTKERNEL = sdot_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CDOTKERNEL = cdot_power9.S else diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..2bd2516de 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c new file mode 100644 index 000000000..56a5ab47a --- /dev/null +++ b/kernel/power/caxpy_microk_power10.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
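/*
 * Scalar picture of what caxpy_kernel_8 below computes: y += alpha*x
 * over interleaved (re,im) float pairs.  The kernel pre-multiplies the
 * splatted alpha_i by the mvec sign vector and permutes x so the real
 * and imaginary halves swap places (the xxperm), which reduces the
 * complex multiply to two fused multiply-adds per element.  Sketch for
 * the non-CONJ convention; the helper name is illustrative.
 */
static void caxpy_scalar_model(long n, const float *x, float *y,
                               float alpha_r, float alpha_i)
{
    const float sign[2] = { -1.0f, 1.0f };   /* mvec analogue */
    long i;
    for (i = 0; i < 2 * n; i += 2) {
        float sw0 = x[i + 1], sw1 = x[i];    /* swapped halves */
        y[i]     += alpha_r * x[i]     + (alpha_i * sign[0]) * sw0;
        y[i + 1] += alpha_r * x[i + 1] + (alpha_i * sign[1]) * sw1;
    }
}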
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8 (long n, float *x, float *y, + float alpha_r, float alpha_i) +{ +#if !defined(CONJ) + static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + const float *mvecp = mvec; + /* We have to load reverse mask for big endian. */ + /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ + + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + long ytmp; + + __asm__ + ( + "xscvdpspn 32, %7 \n\t" + "xscvdpspn 33, %8 \n\t" + "xxspltw 32, 32, 0 \n\t" + "xxspltw 33, 33, 0 \n\t" + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmulsp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmulsp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + "mr %4, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + + "lxvp 48, 0(%3) \n\t" // y0 + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=b" (ytmp) // 4 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 7 + "d" (alpha_i), // 8 + "4" (mvecp), // 9 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59" + ); +} diff --git a/kernel/power/caxpy_power10.c b/kernel/power/caxpy_power10.c new file mode 100644 index 000000000..14b8cda67 --- /dev/null +++ b/kernel/power/caxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
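/*
 * Blocking idiom used by the C wrapper below: n & -16 clears the four
 * low bits of n, i.e. yields the largest multiple of 16 not exceeding
 * n.  The vector kernel consumes that body, and a plain scalar loop
 * finishes the at-most-15 leftover elements.
 */
static long vector_body_len(long n)
{
    return n & -16;   /* 37 -> 32, 16 -> 16, 15 -> 0 */
}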
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "caxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + caxpy_kernel_8 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); + i += n1; - i = n1; while(i < n) { diff --git a/kernel/power/ddot_microk_power10.c b/kernel/power/ddot_microk_power10.c new file mode 100644 index 000000000..3a9865cc0 --- /dev/null +++ b/kernel/power/ddot_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static double ddot_kernel_8 (long n, double *x, double *y) +{ + double dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + + "xsadddp %x0, 32, 33 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=d" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/ddot_power10.c b/kernel/power/ddot_power10.c new file mode 100644 index 000000000..302dceb68 --- /dev/null +++ b/kernel/power/ddot_power10.c @@ -0,0 +1,130 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "ddot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + dot = ddot_kernel_8(n1, x, y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b2a29140e..b531799a6 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif ) { - BLASLONG N = n; BLASLONG i1; #if defined(TRMMKERNEL) BLASLONG off; @@ -158,10 +157,221 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, off = -offset; #endif v4sf_t valpha = { alpha, alpha }; - N = n >> 2; - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j, temp; + BLASLONG j, temp; + 
FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + for (j = 0; j < (m >> 3); j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rb = (vec_t *) & BO[0]; + __vector_pair rowB, rowB1; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 3]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + SAVE_ACC (&acc4, 4); + SAVE_ACC1 (&acc5, 4); + SAVE_ACC (&acc6, 6); + SAVE_ACC1 (&acc7, 6); + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + if (m & 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 2]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + if (m & 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + 
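/* rb[0..3] are the four 2-double chunks of this k-step of B (eight
 * values for the eight columns handled in this path).  Each
 * __builtin_mma_assemble_pair glues two chunks into a __vector_pair,
 * the 4-double operand the fp64 MMA instructions take; xvf64ger then
 * forms a 4x2 outer product per accumulator (a __vector_quad holds
 * eight doubles).  The reversed argument order (rb[1], rb[0]) appears
 * to account for register ordering within the pair on little-endian
 * POWER10. */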
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 1]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + if (m & 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; + v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; + v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] }; + v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA * rowB1; + t2 += rowA * rowB2; + t3 += rowA * rowB3; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; + CO[4 * ldc] = t2[0]; + CO[5 * ldc] = t2[1]; + CO[6 * ldc] = t3[0]; + CO[7 * ldc] = t3[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; + CO[4 * ldc] += t2[0]; + CO[5 * ldc] += t2[1]; + CO[6 * ldc] += t3[0]; + CO[7 * ldc] += t3[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + B += k << 3; + } + if (n & 4) + { + BLASLONG j, temp; FLOAT *CO; FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) @@ -172,71 +382,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; - for (j = 0; j < i; j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 4); -#else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - BLASLONG l = 0; - PREFETCH1 (CO, 0); - PREFETCH1 (CO + ldc, 0); - PREFETCH1 (CO + ldc + ldc, 0); - PREFETCH1 (CO + ldc + ldc + ldc, 0); - PREFETCH1 (CO, 128); - PREFETCH1 (CO + ldc, 128); - PREFETCH1 (CO + ldc + ldc, 128); - PREFETCH1 (CO + ldc + ldc + ldc, 128); - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 4]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - 
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); - } - SAVE_ACC (&acc0, 0); - SAVE_ACC (&acc2, 4); - SAVE_ACC (&acc1, 2); - SAVE_ACC (&acc3, 6); - SAVE_ACC (&acc4, 8); - SAVE_ACC (&acc6, 12); - SAVE_ACC (&acc5, 10); - SAVE_ACC (&acc7, 14); - AO += temp << 4; - BO += temp << 2; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 4) -#endif - CO += 16; - } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { - BLASLONG i, j, temp; + BLASLONG j, temp; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc << 1; AO = A; - i = m >> 4; - for (j = 0; j < i; j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 2); -#else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[0]; - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); - for (l = 1; l < temp; l++) - { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - rowA = (vec_t *) & AO[l << 4]; - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); - } - SAVE2x4_ACC (&acc0, 0); - SAVE2x4_ACC (&acc1, 2); - SAVE2x4_ACC (&acc2, 4); - SAVE2x4_ACC (&acc3, 6); - SAVE2x4_ACC (&acc4, 8); - SAVE2x4_ACC (&acc5, 10); - SAVE2x4_ACC (&acc6, 12); - SAVE2x4_ACC (&acc7, 14); - CO += 16; - AO += temp << 4; - BO += temp << 1; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE 
(16, 2) -#endif - } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 1; } - N = (n & 1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 1) -#else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - v4sf_t t1 = { 0, 0 }; - v4sf_t t2 = { 0, 0 }; - v4sf_t t3 = { 0, 0 }; - v4sf_t t4 = { 0, 0 }; - v4sf_t t5 = { 0, 0 }; - v4sf_t t6 = { 0, 0 }; - v4sf_t t7 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowB = { BO[l], BO[l] }; - v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; - v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; - v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; - v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; - v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; - v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; - v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; - v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; - t += rowA * rowB; - t1 += rowA1 * rowB; - t2 += rowA2 * rowB; - t3 += rowA3 * rowB; - t4 += rowA4 * rowB; - t5 += rowA5 * rowB; - t6 += rowA6 * rowB; - t7 += rowA7 * rowB; - } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; - t4 = t4 * valpha; - t5 = t5 * valpha; - t6 = t6 * valpha; - t7 = t7 * valpha; -#if defined(TRMMKERNEL) - CO[0] = t[0]; - CO[1] = t[1]; - CO[2] = t1[0]; - CO[3] = t1[1]; - CO[4] = t2[0]; - CO[5] = t2[1]; - CO[6] = t3[0]; - CO[7] = t3[1]; - CO[8] = t4[0]; - CO[9] = t4[1]; - CO[10] = t5[0]; - CO[11] = t5[1]; - CO[12] = t6[0]; - CO[13] = t6[1]; - CO[14] = t7[0]; - CO[15] = t7[1]; -#else - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t1[0]; - CO[3] += t1[1]; - CO[4] += t2[0]; - CO[5] += t2[1]; - CO[6] += t3[0]; - CO[7] += t3[1]; - CO[8] += t4[0]; - CO[9] += t4[1]; - CO[10] += t5[0]; - CO[11] += t5[1]; - CO[12] += t6[0]; - CO[13] += t6[1]; - CO[14] += t7[0]; - CO[15] += t7[1]; -#endif - AO += temp << 4; - BO += temp; - CO += 16; - i -= 16; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 1) -#endif - } - while (i >= 8) + for (i = 0; i < (m >> 3); i++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT 
alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/kernel/power/dgemm_ncopy_8_power10.c b/kernel/power/dgemm_ncopy_8_power10.c new file mode 100644 index 000000000..9836c2e7f --- /dev/null +++ b/kernel/power/dgemm_ncopy_8_power10.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+ BLASLONG i, j;
+
+ IFLOAT *aoffset;
+ IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+ IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+ IFLOAT *boffset;
+ IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ IFLOAT ctemp09, ctemp17, ctemp33;
+ IFLOAT ctemp25, ctemp41;
+ IFLOAT ctemp49, ctemp57;
+
+ aoffset = a;
+ boffset = b;
+
+ j = (n >> 3);
+ if (j > 0){
+ do{
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset3 = aoffset2 + lda;
+ aoffset4 = aoffset3 + lda;
+ aoffset5 = aoffset4 + lda;
+ aoffset6 = aoffset5 + lda;
+ aoffset7 = aoffset6 + lda;
+ aoffset8 = aoffset7 + lda;
+ aoffset += 8 * lda;
+
+ i = (m >> 3);
+ if (i > 0){
+ do{
+ PREFETCHA (aoffset1, 384);
+ PREFETCHA (aoffset2, 384);
+ PREFETCHA (aoffset3, 384);
+ PREFETCHA (aoffset4, 384);
+ PREFETCHA (aoffset5, 384);
+ PREFETCHA (aoffset6, 384);
+ PREFETCHA (aoffset7, 384);
+ PREFETCHA (aoffset8, 384);
+ __vector double va0 = *(__vector double*)(aoffset1 + 0);
+ __vector double va1 = *(__vector double*)(aoffset1 + 2);
+ __vector double va2 = *(__vector double*)(aoffset1 + 4);
+ __vector double va3 = *(__vector double*)(aoffset1 + 6);
+
+ __vector double va4 = *(__vector double*)(aoffset2 + 0);
+ __vector double va5 = *(__vector double*)(aoffset2 + 2);
+ __vector double va6 = *(__vector double*)(aoffset2 + 4);
+ __vector double va7 = *(__vector double*)(aoffset2 + 6);
+
+ __vector double va8 = *(__vector double*)(aoffset3 + 0);
+ __vector double va9 = *(__vector double*)(aoffset3 + 2);
+ __vector double va10 = *(__vector double*)(aoffset3 + 4);
+ __vector double va11 = *(__vector double*)(aoffset3 + 6);
+
+ __vector double va12 = *(__vector double*)(aoffset4 + 0);
+ __vector double va13 = *(__vector double*)(aoffset4 + 2);
+ __vector double va14 = *(__vector double*)(aoffset4 + 4);
+ __vector double va15 = *(__vector double*)(aoffset4 + 6);
+
+ __vector double va16 = *(__vector double*)(aoffset5 + 0);
+ __vector double va17 = *(__vector double*)(aoffset5 + 2);
+ __vector double va18 = *(__vector double*)(aoffset5 + 4);
+ __vector double va19 = *(__vector double*)(aoffset5 + 6);
+
+ __vector double va20 = *(__vector double*)(aoffset6 + 0);
+ __vector double va21 = *(__vector double*)(aoffset6 + 2);
+ __vector double va22 = *(__vector double*)(aoffset6 + 4);
+ __vector double va23 = *(__vector double*)(aoffset6 + 6);
+
+ __vector double va24 = *(__vector double*)(aoffset7 + 0);
+ __vector double va25 = *(__vector double*)(aoffset7 + 2);
+ __vector double va26 = *(__vector double*)(aoffset7 + 4);
+ __vector double va27 = *(__vector double*)(aoffset7 + 6);
+
+ __vector double va28 = *(__vector double*)(aoffset8 + 0);
+ __vector double va29 = *(__vector double*)(aoffset8 + 2);
+ __vector double va30 = *(__vector double*)(aoffset8 + 4);
+ __vector double va31 = *(__vector double*)(aoffset8 + 6);
+
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va4, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0);
+ *(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0);
+ *(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0);
+ *(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3);
+ *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
+ *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
+ 
*(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3); + + *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0); + *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0); + *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0); + *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0); + *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3); + *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3); + *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3); + *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3); + + *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0); + *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0); + *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0); + *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0); + *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3); + *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3); + *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3); + *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3); + + *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0); + *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0); + *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0); + *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0); + *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3); + *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3); + *(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3); + *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3); + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + PREFETCHA (aoffset1, 384); + PREFETCHA (aoffset2, 384); + PREFETCHA (aoffset3, 384); + PREFETCHA (aoffset4, 384); + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset1 + 2); + __vector double va2 = *(__vector double*)(aoffset2 + 0); + __vector double va3 = *(__vector double*)(aoffset2 + 2); + __vector double va4 = *(__vector double*)(aoffset3 + 0); + __vector double va5 = *(__vector double*)(aoffset3 + 2); + __vector double va6 = *(__vector double*)(aoffset4 + 0); + __vector double va7 = *(__vector double*)(aoffset4 + 2); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0); + *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3); + *(__vector 
double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3); + *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0); + *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0); + *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3); + *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset2 + 0); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3); + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index 4be8a5f9b..e47de2cb5 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -/************************************************************************************** -* 2016/03/30 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) @@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } +static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + + double *a0; + double *a1; + double *a2; + double *a3; + double *a4; + double *a5; + double *a6; + double *a7; + long tmp; + __asm__ + ( + "lxvp 34, 0( %15) \n\t" // x0, x1 + "lxvp 38, 32( %15) \n\t" // x4, x5 + + XXSPLTD_S(58,%x14,0) // alpha, alpha + "sldi %10, %17, 3 \n\t" // lda * sizeof (double) + "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha + "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha + "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha + + "li %11, 32 \n\t" + + "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda + "add %10, %10, %10 \n\t" // 2 * lda + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha + + "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda + "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda + "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda + "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda + "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda + + "lxvp 40, 0( %3) \n\t" // a0[0], a0[1] + "lxvp 42, 0( %4) \n\t" // a1[0], a1[1] + "lxvp 44, 0( %5) \n\t" // a2[0], a2[1] + "lxvp 46, 0( %6) \n\t" // a3[0], a3[1] + "lxvp 50, 0( %7) \n\t" // a4[0] + "lxvp 52, 0( %8) \n\t" // a5[0] + "lxvp 54, 0( %9) \n\t" // a6[0] + "lxvp 56, 0( %10) \n\t" // a7[0] + + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "lxvpx 50, %7, %11 \n\t" // a4[0] + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "lxvpx 52, %8, %11 \n\t" // a5[0] + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "lxvpx 54, %9, %11 \n\t" // a6[0] + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "lxvpx 56, %10, %11 \n\t" // a7[0] + "addi %11, %11, 32 \n\t" + + "stxvp 36, 0( %2) \n\t" // y0, y1 + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t"
+ "bgt one%= \n"
+
+ "two%=: \n\t"
+
+ "lxvp 36, 0( %2) \n\t" // y0, y1
+ "xvmaddadp 36, 40, 34 \n\t"
+ "xvmaddadp 37, 41, 34 \n\t"
+ "xvmaddadp 36, 42, 35 \n\t"
+ "xvmaddadp 37, 43, 35 \n\t"
+ "xvmaddadp 36, 44, 32 \n\t"
+ "xvmaddadp 37, 45, 32 \n\t"
+ "xvmaddadp 36, 46, 33 \n\t"
+ "xvmaddadp 37, 47, 33 \n\t"
+ "xvmaddadp 36, 50, 48 \n\t"
+ "xvmaddadp 37, 51, 48 \n\t"
+ "xvmaddadp 36, 52, 49 \n\t"
+ "xvmaddadp 37, 53, 49 \n\t"
+ "xvmaddadp 36, 54, 38 \n\t"
+ "xvmaddadp 37, 55, 38 \n\t"
+ "xvmaddadp 36, 56, 39 \n\t"
+ "xvmaddadp 37, 57, 39 \n\t"
+ "stxvp 36, 0( %2) \n\t" // y0, y1
+
+ :
+ "+m" (*y),
+ "+r" (n), // 1
+ "+b" (y), // 2
+ "=b" (a0), // 3
+ "=b" (a1), // 4
+ "=&b" (a2), // 5
+ "=&b" (a3), // 6
+ "=&b" (a4), // 7
+ "=&b" (a5), // 8
+ "=&b" (a6), // 9
+ "=&b" (a7), // 10
+ "=b" (tmp)
+ :
+ "m" (*x),
+ "m" (*ap),
+ "d" (alpha), // 14
+ "r" (x), // 15
+ "3" (ap), // 16
+ "4" (lda) // 17
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
+ "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
+ );
+}
diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c
index ad5f1ba0d..aba15ab4e 100644
--- a/kernel/power/dgemv_n_power10.c
+++ b/kernel/power/dgemv_n_power10.c
@@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#include <altivec.h>
-
-typedef __vector unsigned char vec_t;
-typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
-typedef __vector_pair __attribute__((aligned(8))) vecp_t;
 #include "dgemv_n_microk_power10.c"
-#define MMA(X, APTR, ACC) \
- rX = (vec_t *) & X; \
- rowA = *((vecp_t*)((void*)&APTR)); \
- __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
-
-#define SAVE(ACC, Z) \
- rowC = (v4sf_t *) &y[Z]; \
- __builtin_mma_disassemble_acc ((void *)result, ACC); \
- result[0][1] = result[1][0]; \
- result[2][1] = result[3][0]; \
- rowC[0] += valpha * result[0]; \
- rowC[1] += valpha * result[2];
-
-void
-dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
- FLOAT * y, FLOAT alpha)
-{
- BLASLONG i, j, tmp;
- FLOAT *a0 = a_ptr;
- FLOAT *x1 = xo;
- vector double valpha = { alpha, alpha };
- v4sf_t *rowC;
- __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- v4sf_t result[4];
- vecp_t rowA;
- vec_t *rX;
- tmp = (n / 32) * 32;
- for (i = 0; i < tmp; i += 32)
- {
- xo = x1;
- a0 = a_ptr;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
- __builtin_mma_xxsetaccz (&acc2);
- __builtin_mma_xxsetaccz (&acc3);
- __builtin_mma_xxsetaccz (&acc4);
- __builtin_mma_xxsetaccz (&acc5);
- __builtin_mma_xxsetaccz (&acc6);
- __builtin_mma_xxsetaccz (&acc7);
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + 0 + j * lda], &acc0);
- MMA (xo[j], a0[i + 4 + j * lda], &acc1);
- MMA (xo[j], a0[i + 8 + j * lda], &acc2);
- MMA (xo[j], a0[i + 12 + j * lda], &acc3);
- 
MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i + 0); - SAVE (&acc1, i + 4); - SAVE (&acc2, i + 8); - SAVE (&acc3, i + 12); - SAVE (&acc4, i + 16); - SAVE (&acc5, i + 20); - SAVE (&acc6, i + 24); - SAVE (&acc7, i + 28); - - } - for (i = tmp; i < n; i += 4) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i); - } -} - - #define NBMAX 4096 #ifndef HAVE_KERNEL_4x4 @@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; - BLASLONG n1; BLASLONG m1; BLASLONG m2; BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - BLASLONG lda128 = lda << 7; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8] __attribute__ ((aligned (16))); FLOAT *ybuffer; @@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); ybuffer = buffer; - BLASLONG n128 = n >> 7; - n1 = (n - (n128 * 128)) >> 2; - n2 = (n - (n128 * 128)) & 3; + BLASLONG n8 = n >> 3; + n2 = n & 3; m3 = m & 3 ; m1 = m & -4 ; @@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); - a_ptr += lda128; - x_ptr += 128; + dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda8; + x_ptr += 8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); a_ptr += lda4; @@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - FLOAT xbuffer[128] __attribute__ ((aligned (16))); 
BLASLONG j; - for ( j = 0; j < 128 ; j++) + for ( j = 0; j < 8 ; j++) { xbuffer[j] = x_ptr[0]; x_ptr += inc_x; } - dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); - a_ptr += lda128; + dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c new file mode 100644 index 000000000..6ede1dcdd --- /dev/null +++ b/kernel/power/saxpy_microk_power10.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void saxpy_kernel_64(long n, float *x, float *y, float alpha) +{ + __vector float t0 = {alpha, alpha,alpha, alpha}; + + __asm__ + ( + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + "lxvp 48, 128(%2) \n\t" + "lxvp 50, 160(%2) \n\t" + "lxvp 52, 192(%2) \n\t" + "lxvp 54, 224(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + "lxvp 36, 256(%3) \n\t" + "lxvp 38, 288(%3) \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "lxvp 44, 320(%3) \n\t" + "lxvp 46, 352(%3) \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + + "lxvp 48, 128(%2) \n\t" + "stxvp 56, 128(%3) \n\t" + + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "lxvp 50, 160(%2) \n\t" + "stxvp 58, 160(%3) \n\t" + + "lxvp 56, 384(%3) \n\t" + "lxvp 58, 416(%3) \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + + "lxvp 52, 192(%2) \n\t" + "stxvp 60, 192(%3) \n\t" + + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + + "lxvp 54, 224(%2) \n\t" + "stxvp 62, 224(%3) \n\t" + + "lxvp 60, 448(%3) \n\t" + "lxvp 62, 480(%3) \n\t" + + "addi %2, %2, 256 \n\t" + "addi %3, %3, 256 \n\t" + + "addic. %1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + "stxvp 56, 128(%3) \n\t" + "stxvp 58, 160(%3) \n\t" + "stxvp 60, 192(%3) \n\t" + "stxvp 62, 224(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "wa" (t0), // 4 + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); + +} + + diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c new file mode 100644 index 000000000..4a13c1f88 --- /dev/null +++ b/kernel/power/saxpy_power10.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "saxpy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_8 +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 ) + saxpy_kernel_64(n1, &x[i], &y[i], da); + + i += n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/sdot_microk_power10.c b/kernel/power/sdot_microk_power10.c new file mode 100644 index 000000000..2f028c5a0 --- /dev/null +++ b/kernel/power/sdot_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static float sdot_kernel_16 (long n, float *x, float *y) +{ + float dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=f" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/sdot_power10.c b/kernel/power/sdot_power10.c new file mode 100644 index 000000000..b61f0a90d --- /dev/null +++ b/kernel/power/sdot_power10.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "sdot_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+ BLASLONG register i = 0;
+ FLOAT dot = 0.0;
+
+ while(i < n)
+ {
+ dot += y[i] * x[i]
+ + y[i+1] * x[i+1]
+ + y[i+2] * x[i+2]
+ + y[i+3] * x[i+3]
+ + y[i+4] * x[i+4]
+ + y[i+5] * x[i+5]
+ + y[i+6] * x[i+6]
+ + y[i+7] * x[i+7] ;
+
+ i+=8 ;
+
+ }
+ return dot;
+}
+
+#endif
+
+#if defined (DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ double dot = 0.0 ;
+
+#if defined (DSDOT)
+ double mydot = 0.0;
+ FLOAT asmdot = 0.0;
+#else
+ FLOAT mydot=0.0;
+#endif
+ BLASLONG n1;
+
+ if ( n <= 0 ) return(dot);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ n1 = n & (BLASLONG)(-32);
+
+ if ( n1 )
+#if defined(DSDOT)
+ {
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG n2 = 32;
+ while (i < n1)
+ {
+ asmdot = sdot_kernel_16(n2, x1, y1);
+ mydot += (double)asmdot;
+ x1 += n2;
+ y1 += n2;
+ i += n2;
+ }
+ }
+#else
+ mydot = sdot_kernel_16(n1, x, y);
+#endif
+
+ i = n1;
+ while(i < n)
+ {
+#if defined(DSDOT)
+ dot += (double)y[i] * (double)x[i] ;
+#else
+ dot += y[i] * x[i] ;
+#endif
+ i++ ;
+
+ }
+ dot += mydot;
+ return(dot);
+
+ }
+
+ n1 = n & (BLASLONG)(-2);
+
+ while(i < n1)
+ {
+#if defined(DSDOT)
+ dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
+#else
+ dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x] ;
+#endif
+ ix += inc_x*2 ;
+ iy += inc_y*2 ;
+ i += 2 ;
+
+ }
+
+ while(i < n)
+ {
+#if defined(DSDOT)
+ dot += (double)y[iy] * (double)x[ix] ;
+#else
+ dot += y[iy] * x[ix] ;
+#endif
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(dot);
+
+}
diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c
--- a/kernel/power/sgemm_kernel_power10.c
+++ b/kernel/power/sgemm_kernel_power10.c
@@ ... @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
- N = n >> 3;
- for (i1 = 0; i1 < N; i1++)
+ for (i1 = 0; i1 < (n >> 3); i1++)
 {
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
 FLOAT *CO;
 FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
@@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 AO = A;
 PREFETCH1 (A, 128);
 PREFETCH1 (A, 256);
- i = m >> 4;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 4); j++)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
 CO += 16;
 }
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 REFRESH_AFTER_SAVE (8, 8)
#endif
 }
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 REFRESH_AFTER_SAVE (4, 8)
#endif
 }
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -550,8 
+544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 8) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 3; } - N = (n & 7) >> 2; - for (i1 = 0; i1 < N; i1++) + if (n & 4) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 4) #endif } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 2) #endif } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; v4sf_t *rowC; @@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; v4sf_t *rowC; @@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; BLASLONG l = 0; @@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 1; } - N = (n & 1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) + for (i = 0; i < (m >> 4); i++) { FLOAT *BO; BLASLONG l = 0; @@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 4; BO += temp; CO += 16; - i -= 16; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (16, 1) #endif } - while (i >= 8) + if (m & 8) { FLOAT *BO; BLASLONG l = 0; @@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; BLASLONG l = 0; @@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, 
BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..5ca1603a6 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+ FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+ c0 = &c[0*ldc];
+ c1 = &c[1*ldc];
+ c2 = &c[2*ldc];
+ c3 = &c[3*ldc];
+ c4 = &c[4*ldc];
+ c5 = &c[5*ldc];
+ c6 = &c[6*ldc];
+ c7 = &c[7*ldc];
+ vector FLOAT *Va = (vector FLOAT *) a;
+ vector FLOAT *Vb = (vector FLOAT *) b;
+ vector FLOAT *Vc0 = (vector FLOAT *) c0;
+ vector FLOAT *Vc1 = (vector FLOAT *) c1;
+ vector FLOAT *Vc2 = (vector FLOAT *) c2;
+ vector FLOAT *Vc3 = (vector FLOAT *) c3;
+ vector FLOAT *Vc4 = (vector FLOAT *) c4;
+ vector FLOAT *Vc5 = (vector FLOAT *) c5;
+ vector FLOAT *Vc6 = (vector FLOAT *) c6;
+ vector FLOAT *Vc7 = (vector FLOAT *) c7;
+ vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
+
+ b[56] = (c0[7] *= a[63]);
+ b[57] = (c1[7] *= a[63]);
+ b[58] = (c2[7] *= a[63]);
+ b[59] = (c3[7] *= a[63]);
+ b[60] = (c4[7] *= a[63]);
+ b[61] = (c5[7] *= a[63]);
+ b[62] = (c6[7] *= a[63]);
+ b[63] = (c7[7] *= a[63]);
+ VbS0 = vec_splat(Vb[28], 0);
+ VbS1 = vec_splat(Vb[28], 1);
+ VbS2 = vec_splat(Vb[29], 0);
+ VbS3 = vec_splat(Vb[29], 1);
+ VbS4 = vec_splat(Vb[30], 0);
+ VbS5 = vec_splat(Vb[30], 1);
+ VbS6 = vec_splat(Vb[31], 0);
+ VbS7 = vec_splat(Vb[31], 1);
+ Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]);
+ Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]);
+ Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]);
+ Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]);
+ Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]);
+ Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]);
+ Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]);
+ Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]);
+ Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]);
+ Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]);
+ Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]);
+ Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]);
+ Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]);
+ Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]);
+ Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]);
+ Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]);
+ Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]);
+ Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]);
+ Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]);
+ Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]);
+ Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]);
+ Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]);
+ Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]);
+ Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]);
+ c0[6] -= c0[7] * a[62];
+ c1[6] -= c1[7] * a[62];
+ c2[6] -= c2[7] * a[62];
+ c3[6] -= c3[7] * a[62];
+ c4[6] -= c4[7] * a[62];
+ c5[6] -= c5[7] * a[62];
+ 
c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, 
Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + 
vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], 
Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + 
Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + c2[ 8] -= b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = 
vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] 
*= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] -= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] 
* a[84];
+  c7[4] -= b[47] * a[84];
+
+  b[32] = (c0[4] *= a[68]);
+  b[33] = (c1[4] *= a[68]);
+  b[34] = (c2[4] *= a[68]);
+  b[35] = (c3[4] *= a[68]);
+  b[36] = (c4[4] *= a[68]);
+  b[37] = (c5[4] *= a[68]);
+  b[38] = (c6[4] *= a[68]);
+  b[39] = (c7[4] *= a[68]);
+  VbS0 = vec_splat(Vb[8], 0);
+  VbS1 = vec_splat(Vb[8], 1);
+  VbS2 = vec_splat(Vb[8], 2);
+  VbS3 = vec_splat(Vb[8], 3);
+  VbS4 = vec_splat(Vb[9], 0);
+  VbS5 = vec_splat(Vb[9], 1);
+  VbS6 = vec_splat(Vb[9], 2);
+  VbS7 = vec_splat(Vb[9], 3);
+  Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]);
+  Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]);
+  Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]);
+  Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]);
+  Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]);
+  Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]);
+  Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]);
+  Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]);
+
+  b[24] = (c0[3] *= a[51]);
+  b[25] = (c1[3] *= a[51]);
+  b[26] = (c2[3] *= a[51]);
+  b[27] = (c3[3] *= a[51]);
+  b[28] = (c4[3] *= a[51]);
+  b[29] = (c5[3] *= a[51]);
+  b[30] = (c6[3] *= a[51]);
+  b[31] = (c7[3] *= a[51]);
+  c0[0] -= b[24] * a[48];
+  c0[1] -= b[24] * a[49];
+  c0[2] -= b[24] * a[50];
+  c1[0] -= b[25] * a[48];
+  c1[1] -= b[25] * a[49];
+  c1[2] -= b[25] * a[50];
+  c2[0] -= b[26] * a[48];
+  c2[1] -= b[26] * a[49];
+  c2[2] -= b[26] * a[50];
+  c3[0] -= b[27] * a[48];
+  c3[1] -= b[27] * a[49];
+  c3[2] -= b[27] * a[50];
+  c4[0] -= b[28] * a[48];
+  c4[1] -= b[28] * a[49];
+  c4[2] -= b[28] * a[50];
+  c5[0] -= b[29] * a[48];
+  c5[1] -= b[29] * a[49];
+  c5[2] -= b[29] * a[50];
+  c6[0] -= b[30] * a[48];
+  c6[1] -= b[30] * a[49];
+  c6[2] -= b[30] * a[50];
+  c7[0] -= b[31] * a[48];
+  c7[1] -= b[31] * a[49];
+  c7[2] -= b[31] * a[50];
+
+  b[16] = (c0[2] *= a[34]);
+  b[17] = (c1[2] *= a[34]);
+  b[18] = (c2[2] *= a[34]);
+  b[19] = (c3[2] *= a[34]);
+  b[20] = (c4[2] *= a[34]);
+  b[21] = (c5[2] *= a[34]);
+  b[22] = (c6[2] *= a[34]);
+  b[23] = (c7[2] *= a[34]);
+  c0[0] -= b[16] * a[32];
+  c0[1] -= b[16] * a[33];
+  c1[0] -= b[17] * a[32];
+  c1[1] -= b[17] * a[33];
+  c2[0] -= b[18] * a[32];
+  c2[1] -= b[18] * a[33];
+  c3[0] -= b[19] * a[32];
+  c3[1] -= b[19] * a[33];
+  c4[0] -= b[20] * a[32];
+  c4[1] -= b[20] * a[33];
+  c5[0] -= b[21] * a[32];
+  c5[1] -= b[21] * a[33];
+  c6[0] -= b[22] * a[32];
+  c6[1] -= b[22] * a[33];
+  c7[0] -= b[23] * a[32];
+  c7[1] -= b[23] * a[33];
+
+  b[ 8] = (c0[1] *= a[17]);
+  b[ 9] = (c1[1] *= a[17]);
+  b[10] = (c2[1] *= a[17]);
+  b[11] = (c3[1] *= a[17]);
+  b[12] = (c4[1] *= a[17]);
+  b[13] = (c5[1] *= a[17]);
+  b[14] = (c6[1] *= a[17]);
+  b[15] = (c7[1] *= a[17]);
+  c0[0] -= b[ 8] * a[16];
+  c1[0] -= b[ 9] * a[16];
+  c2[0] -= b[10] * a[16];
+  c3[0] -= b[11] * a[16];
+  c4[0] -= b[12] * a[16];
+  c5[0] -= b[13] * a[16];
+  c6[0] -= b[14] * a[16];
+  c7[0] -= b[15] * a[16];
+
+  b[0] = (c0[0] *= a[0]);
+  b[1] = (c1[0] *= a[0]);
+  b[2] = (c2[0] *= a[0]);
+  b[3] = (c3[0] *= a[0]);
+  b[4] = (c4[0] *= a[0]);
+  b[5] = (c5[0] *= a[0]);
+  b[6] = (c6[0] *= a[0]);
+  b[7] = (c7[0] *= a[0]);
+}
+
+#endif
+
+static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa, bb;
+
+  int i, j, k;
+
+  a += (m - 1) * m;
+  b += (m - 1) * n;
+
+  for (i = m - 1; i >= 0; i--) {
+
+    aa = *(a + i);
+
+    for (j = 0; j < n; j ++) {
+      bb = *(c + i + j * ldc);
+      bb *= aa;
+      *b  = bb;
+      *(c + i + j * ldc) = bb;
+      b ++;
+
+      for (k = 0; k < i; k ++){
+        *(c + k + j * ldc) -= bb * *(a + k);
+      }
+
+    }
+    a -= m;
+    b -= 2 * n;
+  }
+
+}
+
+#else
+
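+/* Complex variant of the scalar solver: each element is a (real, imag)
+   pair, the packed diagonal holds complex reciprocals, and the update
+   uses complex multiplication (conjugated when CONJ is defined). */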
+static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa1, aa2;
+  FLOAT bb1, bb2;
+  FLOAT cc1, cc2;
+
+  int i, j, k;
+
+  ldc *= 2;
+  a += (m - 1) * m * 2;
+  b += (m - 1) * n * 2;
+
+  for (i = m - 1; i >= 0; i--) {
+
+    aa1 = *(a + i * 2 + 0);
+    aa2 = *(a + i * 2 + 1);
+
+    for (j = 0; j < n; j ++) {
+      bb1 = *(c + i * 2 + 0 + j * ldc);
+      bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+      cc1 = aa1 * bb1 - aa2 * bb2;
+      cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+      cc1 = aa1 * bb1 + aa2 * bb2;
+      cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+
+      *(b + 0) = cc1;
+      *(b + 1) = cc2;
+      *(c + i * 2 + 0 + j * ldc) = cc1;
+      *(c + i * 2 + 1 + j * ldc) = cc2;
+      b += 2;
+
+      for (k = 0; k < i; k ++){
+#ifndef CONJ
+        *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
+        *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#else
+        *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
+        *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#endif
+      }
+
+    }
+    a -= m * 2;
+    b -= 4 * n;
+  }
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+          FLOAT dummy2,
+#endif
+          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+  BLASLONG i, j;
+  FLOAT *aa, *cc;
+  BLASLONG kk;
+
+#if 0
+  fprintf(stderr, "TRSM KERNEL LN : m = %3ld  n = %3ld  k = %3ld offset = %3ld\n",
+          m, n, k, offset);
+#endif
+
+#ifdef DOUBLE
+  int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0);
+#else
+  int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0);
+#endif
+
+  j = (n >> GEMM_UNROLL_N_SHIFT);
+
+  while (j > 0) {
+
+    kk = m + offset;
+
+    if (m & (GEMM_UNROLL_M - 1)) {
+      for (i = 1; i < GEMM_UNROLL_M; i *= 2){
+        if (m & i) {
+          aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
+          cc = c + ((m & ~(i - 1)) - i)     * COMPSIZE;
+
+          if (k - kk > 0) {
+            GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+                        ZERO,
+#endif
+                        aa + i             * kk * COMPSIZE,
+                        b  + GEMM_UNROLL_N * kk * COMPSIZE,
+                        cc,
+                        ldc);
+          }
+
+          solve(i, GEMM_UNROLL_N,
+                aa + (kk - i) * i             * COMPSIZE,
+                b  + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
+                cc, ldc);
+
+          kk -= i;
+        }
+      }
+    }
+
+    i = (m >> GEMM_UNROLL_M_SHIFT);
+    if (i > 0) {
+      aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE;
+      cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M)     * COMPSIZE;
+
+      do {
+        if (k - kk > 0) {
+          GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+                      ZERO,
+#endif
+                      aa + GEMM_UNROLL_M * kk * COMPSIZE,
+                      b  + GEMM_UNROLL_N * kk * COMPSIZE,
+                      cc,
+                      ldc);
+        }
+
+        if (well_aligned) {
+#ifdef DOUBLE
+          solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                   b  + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
+#else
+          solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                    b  + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
+#endif
+        }
+        else {
+          solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
+                aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                b  + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE,
+                cc, ldc);
+        }
+
+        aa -= GEMM_UNROLL_M * k * COMPSIZE;
+        cc -= GEMM_UNROLL_M     * COMPSIZE;
+        kk -= GEMM_UNROLL_M;
+        i --;
+      } while (i > 0);
+    }
+
+    b += GEMM_UNROLL_N * k   * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+    j --;
+  }
+
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    j = (GEMM_UNROLL_N >> 1);
+    while (j > 0) {
+      if (n & j) {
+
+        kk = m + offset;
+
+        if (m & (GEMM_UNROLL_M - 1)) {
+          for (i = 1; i < GEMM_UNROLL_M; i *= 2){
+            if (m & i) {
+              aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
+              cc = c + ((m & ~(i - 1)) - i)     * COMPSIZE;
+
+              if (k - kk > 0) {
+                GEMM_KERNEL(i, j, k - kk, dm1,
+#ifdef COMPLEX
+                            ZERO,
+#endif
+                            aa + i * kk * COMPSIZE,
+                            b  + j * kk * COMPSIZE,
+                            cc, ldc);
+              }
+
+              solve(i, j,
+                    aa + (kk - i) * i * COMPSIZE,
+                    b  + (kk - i) * j * COMPSIZE,
+                    cc, ldc);
+
+              kk -= i;
+            }
+          }
+        }
+
+        i = (m >> GEMM_UNROLL_M_SHIFT);
+        if (i > 0) {
+          aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE;
+          cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M)     * COMPSIZE;
+
+          do {
+            if (k - kk > 0) {
+              GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
+#ifdef COMPLEX
+                          ZERO,
+#endif
+                          aa + GEMM_UNROLL_M * kk * COMPSIZE,
+                          b  + j             * kk * COMPSIZE,
+                          cc,
+                          ldc);
+            }
+
+            solve(GEMM_UNROLL_M, j,
+                  aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                  b  + (kk - GEMM_UNROLL_M) * j             * COMPSIZE,
+                  cc, ldc);
+
+            aa -= GEMM_UNROLL_M * k * COMPSIZE;
+            cc -= GEMM_UNROLL_M     * COMPSIZE;
+            kk -= GEMM_UNROLL_M;
+            i --;
+          } while (i > 0);
+        }
+
+        b += j * k   * COMPSIZE;
+        c += j * ldc * COMPSIZE;
+      }
+      j >>= 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c
new file mode 100644
index 000000000..14ff12fe4
--- /dev/null
+++ b/kernel/power/trsm_kernel_LT_power10.c
@@ -0,0 +1,1265 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or       */
+/* without modification, are permitted provided that the following  */
+/* conditions are met:                                              */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above        */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer.                                                 */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above     */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer in the documentation and/or other materials      */
+/*      provided with the distribution.                             */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT      */
+/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
+/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF      */
+/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE      */
+/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
+/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES      */
+/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE     */
+/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR          */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT     */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT    */
+/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
+/*    POSSIBILITY OF SUCH DAMAGE.                                   */
+/*                                                                   */
+/* The views and conclusions contained in the software and          */
+/* documentation are those of the authors and should not be         */
+/* interpreted as representing official policies, either expressed  */
+/* or implied, of The University of Texas at Austin.                */
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+/* Solve one packed 8x8 triangular block against eight columns of C,
+   working from row 0 up to row 7 (LT case). The diagonal entries of
+   a[] are stored pre-inverted by the packing routine, so each
+   division is a multiply; solved values go back to both b[] and c[]. */
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+  c0 = &c[0*ldc];
+  c1 = &c[1*ldc];
+  c2 = &c[2*ldc];
+  c3 = &c[3*ldc];
+  c4 = &c[4*ldc];
+  c5 = &c[5*ldc];
+  c6 = &c[6*ldc];
+  c7 = &c[7*ldc];
+  vector FLOAT *Va = (vector FLOAT *) a;
+  vector FLOAT *Vb = (vector FLOAT *) b;
+  vector FLOAT *Vc0 = (vector FLOAT *) c0;
+  vector FLOAT *Vc1 = (vector FLOAT *) c1;
+  vector FLOAT *Vc2 = (vector FLOAT *) c2;
+  vector FLOAT *Vc3 = (vector FLOAT *) c3;
+  vector FLOAT *Vc4 = (vector FLOAT *) c4;
+  vector FLOAT *Vc5 = (vector FLOAT *) c5;
+  vector FLOAT *Vc6 = (vector FLOAT *) c6;
+  vector FLOAT *Vc7 = (vector FLOAT *) c7;
+  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
+
+  b[0] = (c0[0] *= a[0]);
+  b[1] = (c1[0] *= a[0]);
+  b[2] = (c2[0] *= a[0]);
+  b[3] = (c3[0] *= a[0]);
+  b[4] = (c4[0] *= a[0]);
+  b[5] = (c5[0] *= a[0]);
+  b[6] = (c6[0] *= a[0]);
+  b[7] = (c7[0] *= a[0]);
+  VbS0 = vec_splat(Vb[0], 0);
+  VbS1 = vec_splat(Vb[0], 1);
+  VbS2 = vec_splat(Vb[1], 0);
+  VbS3 = vec_splat(Vb[1], 1);
+  VbS4 = vec_splat(Vb[2], 0);
+  VbS5 = vec_splat(Vb[2], 1);
+  VbS6 = vec_splat(Vb[3], 0);
+  VbS7 = vec_splat(Vb[3], 1);
+  Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]);
+  Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]);
+  Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]);
+  Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]);
+  Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]);
+  Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]);
+  Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]);
+  Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]);
+  Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]);
+  Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]);
+  Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]);
+  Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]);
+  Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]);
+  Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]);
+  Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]);
+  Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]);
+  Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]);
+  Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]);
+  Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]);
+  Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]);
+  Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]);
+  Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]);
+  Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]);
+  Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]);
+  c0[1] -= c0[0] * a[1];
+  c1[1] -= c1[0] * a[1];
+  c2[1] -= c2[0] * a[1];
+  c3[1] -= c3[0] * a[1];
+  c4[1] -= c4[0] * a[1];
+  c5[1] -= c5[0] * a[1];
+  c6[1] -= c6[0] * a[1];
+  c7[1] -= c7[0] * a[1];
+
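+  /* Rows 1 through 7 repeat this forward-substitution step: scale row
+     i by the pre-inverted diagonal entry, store the solved row into
+     the packed b[] buffer, then eliminate it from all following rows. */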
b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, 
Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + 
vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = 
vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * 
a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], 
Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 
0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a 
+ i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * 
COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+  c0 = &c[0*ldc];
+  c1 = &c[1*ldc];
+  c2 = &c[2*ldc];
+  c3 = &c[3*ldc];
+  c4 = &c[4*ldc];
+  c5 = &c[5*ldc];
+  c6 = &c[6*ldc];
+  c7 = &c[7*ldc];
+  vector FLOAT *Vb = (vector FLOAT *) b;
+  vector FLOAT *Vc0 = (vector FLOAT *) c0;
+  vector FLOAT *Vc1 = (vector FLOAT *) c1;
+  vector FLOAT *Vc2 = (vector FLOAT *) c2;
+  vector FLOAT *Vc3 = (vector FLOAT *) c3;
+  vector FLOAT *Vc4 = (vector FLOAT *) c4;
+  vector FLOAT *Vc5 = (vector FLOAT *) c5;
+  vector FLOAT *Vc6 = (vector FLOAT *) c6;
+  vector FLOAT *Vc7 = (vector FLOAT *) c7;
+  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;
+
+  a[0] = (c0[0] *= b[0]);
+  a[1] = (c0[1] *= b[0]);
+  a[2] = (c0[2] *= b[0]);
+  a[3] = (c0[3] *= b[0]);
+  a[4] = (c0[4] *= b[0]);
+  a[5] = (c0[5] *= b[0]);
+  a[6] = (c0[6] *= b[0]);
+  a[7] = (c0[7] *= b[0]);
+  VbS0 = vec_splat(Vb[0], 1);
+  VbS1 = vec_splat(Vb[1], 0);
+  VbS2 = vec_splat(Vb[1], 1);
+  VbS3 = vec_splat(Vb[2], 0);
+  VbS4 = vec_splat(Vb[2], 1);
+  VbS5 = vec_splat(Vb[3], 0);
+  VbS6 = vec_splat(Vb[3], 1);
+  Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]);
+  Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]);
+  Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]);
+  Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]);
+  Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]);
+  Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]);
+  Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]);
+  Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]);
+  Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]);
+  Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]);
+  Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]);
+  Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]);
+  Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]);
+  Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]);
+  Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]);
+  Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]);
+  Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]);
+  Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]);
+  Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]);
+  Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]);
+  Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]);
+  Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]);
+  Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]);
+  Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]);
+  Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]);
+  Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]);
+  Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]);
+  Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]);
+
+  a[ 8] = (c1[0] *= b[9]);
+  a[ 9] = (c1[1] *=
b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = 
vec_nmsub(Vc3[0], VbS3, Vc7[0]); + Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = 
Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, 
Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; 
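+      /* End of one full GEMM_UNROLL_N-wide panel: kk now counts the rows of
+         B that are completely solved, so the GEMM_KERNEL call in the next
+         iteration subtracts exactly the finished contribution before the
+         next diagonal block is solved. */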
+ j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+  c0 = &c[0*ldc];
+  c1 = &c[1*ldc];
+  c2 = &c[2*ldc];
+  c3 = &c[3*ldc];
+  c4 = &c[4*ldc];
+  c5 = &c[5*ldc];
+  c6 = &c[6*ldc];
+  c7 = &c[7*ldc];
+  vector FLOAT *Vb = (vector FLOAT *) b;
+  vector FLOAT *Vc0 = (vector FLOAT *) c0;
+  vector FLOAT *Vc1 = (vector FLOAT *) c1;
+  vector FLOAT *Vc2 = (vector FLOAT *) c2;
+  vector FLOAT *Vc3 = (vector FLOAT *) c3;
+  vector FLOAT *Vc4 = (vector FLOAT *) c4;
+  vector FLOAT *Vc5 = (vector FLOAT *) c5;
+  vector FLOAT *Vc6 = (vector FLOAT *) c6;
+  vector FLOAT *Vc7 = (vector FLOAT *) c7;
+  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;
+
+  a[56] = (c7[0] *= b[63]);
+  a[57] = (c7[1] *= b[63]);
+  a[58] = (c7[2] *= b[63]);
+  a[59] = (c7[3] *= b[63]);
+  a[60] = (c7[4] *= b[63]);
+  a[61] = (c7[5] *= b[63]);
+  a[62] = (c7[6] *= b[63]);
+  a[63] = (c7[7] *= b[63]);
+  VbS0 = vec_splat(Vb[28], 0);
+  VbS1 = vec_splat(Vb[28], 1);
+  VbS2 = vec_splat(Vb[29], 0);
+  VbS3 = vec_splat(Vb[29], 1);
+  VbS4 = vec_splat(Vb[30], 0);
+  VbS5 = vec_splat(Vb[30], 1);
+  VbS6 = vec_splat(Vb[31], 0);
+  Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]);
+  Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]);
+  Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]);
+  Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]);
+  Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]);
+  Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]);
+  Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]);
+  Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]);
+  Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]);
+  Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]);
+  Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]);
+  Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]);
+  Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]);
+  Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]);
+  Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]);
+  Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]);
+  Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]);
+  Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]);
+  Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]);
+  Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]);
+  Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]);
+  Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]);
+  Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]);
+  Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]);
+  Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]);
+  Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]);
+  Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]);
+  Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]);
+
+  a[48] = (c6[0] *= b[54]);
+  a[49] = (c6[1] *= b[54]);
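+  /* Note: as in the scalar solve() further below, the packed diagonal
+     entries (b[63], b[54], ...) are assumed to be pre-inverted by the
+     packing routines, so each division by the diagonal is carried out
+     as a multiply. */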
+ a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + 
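+  /* Column 3 receives the last of these rank-1 updates: every column to
+     the left of the solved column 4 is reduced by that column scaled with
+     the matching splatted entry of the packed triangular factor. */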
Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = 
Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, 
Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, 
Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 
0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL new file mode 100644 index 000000000..68d68b5f8 --- /dev/null +++ b/kernel/riscv64/KERNEL @@ -0,0 +1,30 @@ +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V new file mode 100644 index 000000000..0da66fa35 --- /dev/null +++ b/kernel/riscv64/KERNEL.C910V @@ -0,0 +1,190 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = 
zaxpy_vector.c + +SAXPBYKERNEL = axpby_vector.c +DAXPBYKERNEL = axpby_vector.c +CAXPBYKERNEL = zaxpby_vector.c +ZAXPBYKERNEL = zaxpby_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = ../generic/trmmkernel_16x4.c +DTRMMKERNEL = ../generic/trmmkernel_8x4.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_16x4_c910v.c +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_8x4_c910v.c +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c 
+DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = zhemv_UV_vector.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC new file mode 100644 index 000000000..ea6a8cf21 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -0,0 +1,164 @@ +SAMAXKERNEL = ../riscv64/amax.c +DAMAXKERNEL = ../riscv64/amax.c +CAMAXKERNEL = ../riscv64/zamax.c +ZAMAXKERNEL = ../riscv64/zamax.c + +SAMINKERNEL = ../riscv64/amin.c +DAMINKERNEL = ../riscv64/amin.c +CAMINKERNEL = ../riscv64/zamin.c +ZAMINKERNEL = ../riscv64/zamin.c + +SMAXKERNEL = ../riscv64/max.c +DMAXKERNEL = ../riscv64/max.c + +SMINKERNEL = ../riscv64/min.c +DMINKERNEL = ../riscv64/min.c + +ISAMAXKERNEL = ../riscv64/iamax.c +IDAMAXKERNEL = ../riscv64/iamax.c +ICAMAXKERNEL = ../riscv64/izamax.c +IZAMAXKERNEL = ../riscv64/izamax.c + +ISAMINKERNEL = ../riscv64/iamin.c +IDAMINKERNEL = ../riscv64/iamin.c +ICAMINKERNEL = ../riscv64/izamin.c +IZAMINKERNEL = ../riscv64/izamin.c + +ISMAXKERNEL = ../riscv64/imax.c +IDMAXKERNEL = ../riscv64/imax.c + +ISMINKERNEL = ../riscv64/imin.c +IDMINKERNEL = ../riscv64/imin.c + +SASUMKERNEL = ../riscv64/asum.c +DASUMKERNEL = ../riscv64/asum.c +CASUMKERNEL = ../riscv64/zasum.c +ZASUMKERNEL = ../riscv64/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../riscv64/axpy.c +DAXPYKERNEL = ../riscv64/axpy.c +CAXPYKERNEL = ../riscv64/zaxpy.c +ZAXPYKERNEL = ../riscv64/zaxpy.c + +SCOPYKERNEL = ../riscv64/copy.c +DCOPYKERNEL = ../riscv64/copy.c +CCOPYKERNEL = ../riscv64/zcopy.c +ZCOPYKERNEL = ../riscv64/zcopy.c + +SDOTKERNEL = ../riscv64/dot.c +DDOTKERNEL = ../riscv64/dot.c +CDOTKERNEL = ../riscv64/zdot.c +ZDOTKERNEL = ../riscv64/zdot.c + +SNRM2KERNEL = ../riscv64/nrm2.c +DNRM2KERNEL = ../riscv64/nrm2.c +CNRM2KERNEL = ../riscv64/znrm2.c +ZNRM2KERNEL = ../riscv64/znrm2.c + +SROTKERNEL = ../riscv64/rot.c +DROTKERNEL = ../riscv64/rot.c +CROTKERNEL = ../riscv64/zrot.c +ZROTKERNEL = ../riscv64/zrot.c + +SSCALKERNEL = ../riscv64/scal.c +DSCALKERNEL = ../riscv64/scal.c +CSCALKERNEL = ../riscv64/zscal.c +ZSCALKERNEL = ../riscv64/zscal.c + +SSWAPKERNEL = ../riscv64/swap.c +DSWAPKERNEL = ../riscv64/swap.c +CSWAPKERNEL = ../riscv64/zswap.c +ZSWAPKERNEL = ../riscv64/zswap.c + +SGEMVNKERNEL = ../riscv64/gemv_n.c +DGEMVNKERNEL = ../riscv64/gemv_n.c +CGEMVNKERNEL = ../riscv64/zgemv_n.c +ZGEMVNKERNEL = ../riscv64/zgemv_n.c + +SGEMVTKERNEL = ../riscv64/gemv_t.c +DGEMVTKERNEL = ../riscv64/gemv_t.c +CGEMVTKERNEL = ../riscv64/zgemv_t.c +ZGEMVTKERNEL = ../riscv64/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = 
../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/amax.c b/kernel/riscv64/amax.c new file mode 100644 index 000000000..792e68bd9 --- /dev/null +++ b/kernel/riscv64/amax.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT maxf=0.0;
+
+    if (n <= 0 || inc_x <= 0) return(maxf);
+
+    maxf=ABS(x[0]);
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( ABS(x[ix]) > maxf )
+        {
+            maxf = ABS(x[ix]);
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(maxf);
+}
+
+
diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
new file mode 100644
index 000000000..b6aec131e
--- /dev/null
+++ b/kernel/riscv64/amax_vector.c
@@ -0,0 +1,245 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + BLASLONG inc_xv = inc_x * gvl; + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c new file mode 100644 index 000000000..78495a8e3 --- /dev/null +++ b/kernel/riscv64/amin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+
+    if (n <= 0 || inc_x <= 0) return(minf);
+
+    minf=ABS(x[0]);
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( ABS(x[ix]) < minf )
+        {
+            minf = ABS(x[ix]);
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(minf);
+}
+
+
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
new file mode 100644
index 000000000..53243ad56
--- /dev/null
+++ b/kernel/riscv64/amin_vector.c
@@ -0,0 +1,241 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c new file mode 100644 index 000000000..7ab7484e8 --- /dev/null +++ b/kernel/riscv64/asum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*8 \n\t"\ + "addi t2, %[PB], 2*8 \n\t"\ + "addi t3, %[PB], 3*8 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "fld ft1, (t1) \n\t"\ + "fld ft2, (t2) \n\t"\ + "fld ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 2*8 \n\t"\ + "addi t5, %[PA], 4*8 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 6*8 \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + 
"vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "fld ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "fld ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "fld ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "fld ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], 
stride_x, gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + } + return(dot); +} + + diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c new file mode 100644 index 000000000..ef61b245b --- /dev/null +++ b/kernel/riscv64/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c new file mode 100644 index 000000000..3aa64afc9 --- /dev/null +++ b/kernel/riscv64/iamax_vector.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0, j=0;
+    FLOAT maxf=0.0;
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+    unsigned int gvl = 0;
+    if(inc_x == 1){
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_max_index = VMVVX_UINT(0, gvl);
+        v_max = VFMVVF_FLOAT(-1, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+            v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+            v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+            j += gvl;
+        }
+        vx = VFMVVF_FLOAT(0, gvl);
+        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+        maxf = vx[0];
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        max_index = VMFIRSTM(mask,gvl);
+        max_index = v_max_index[max_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            vx = VFMVVF_FLOAT(0, gvl);
+            vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+            FLOAT cur_maxf = vx[0];
+            if(cur_maxf > maxf){
+                //tail index
+                v_max_index = VIDV_UINT(gvl);
+                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+            }
+        }
+    }else{
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x;
+
+        v_max_index = VMVVX_UINT(0, gvl);
+        v_max =
VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c new file mode 100644 index 000000000..155292bd5 --- /dev/null +++ b/kernel/riscv64/iamin.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : NoTest
+* BLASTEST double : NoTest
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+    BLASLONG min=0;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    minf=ABS(x[0]);
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( ABS(x[ix]) < ABS(minf) )
+        {
+            min = i;
+            minf = ABS(x[ix]);
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(min+1);
+}
+
+
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
new file mode 100644
index 000000000..608f19a00
--- /dev/null
+++ b/kernel/riscv64/iamin_vector.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0, j=0;
+    FLOAT minf=FLT_MAX;
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+
+    FLOAT_V_T vx, v_min;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+    unsigned int gvl = 0;
+    if(inc_x == 1){
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+        v_min_index = VMVVX_UINT(0, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            //index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+            v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl);
+            v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx, gvl);
+            j += gvl;
+        }
+        vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+        vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+        minf = vx[0];
+        mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+        min_index = VMFIRSTM(mask,gvl);
+        min_index = v_min_index[min_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+            vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+            FLOAT cur_minf = vx[0];
+            if(cur_minf < minf){
+                //tail index
+                v_min_index = VIDV_UINT(gvl);
+                v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
+                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                min_index = VMFIRSTM(mask,gvl);
+                min_index = v_min_index[min_index];
+            }
+        }
+    }else{
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x;
+
+        v_min = VFMVVF_FLOAT(FLT_MAX,
gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c new file mode 100644 index 000000000..5072dd16e --- /dev/null +++ b/kernel/riscv64/imax.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : NoTest
+* BLASTEST double : NoTest
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT maxf=0.0;
+    BLASLONG max=0;
+
+    if (n <= 0 || inc_x <= 0) return(max);
+
+    maxf=x[0];
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( x[ix] > maxf )
+        {
+            max = i;
+            maxf = x[ix];
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(max+1);
+}
+
+
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
new file mode 100644
index 000000000..44af7101b
--- /dev/null
+++ b/kernel/riscv64/imax_vector.c
@@ -0,0 +1,176 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0, j=0;
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+    FLOAT maxf=-FLT_MAX;
+
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+    unsigned int gvl = 0;
+    if(inc_x == 1){
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_max_index = VMVVX_UINT(0, gvl);
+        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLEV_FLOAT(&x[j], gvl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+            v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+            v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+            j += gvl;
+        }
+        vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+        maxf = vx[0];
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        max_index = VMFIRSTM(mask,gvl);
+        max_index = v_max_index[max_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            v_max = VLEV_FLOAT(&x[j], gvl);
+
+            vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+            vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+            FLOAT cur_maxf = vx[0];
+            if(cur_maxf > maxf){
+                //tail index
+                v_max_index = VIDV_UINT(gvl);
+                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+            }
+        }
+    }else{
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x;
+
+        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        v_max_index = VMVVX_UINT(0, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+            v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+            v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx,
gvl);
+                j += gvl;
+                idx += inc_v;
+        }
+        vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+        maxf = vx[0];
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        max_index = VMFIRSTM(mask,gvl);
+        max_index = v_max_index[max_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+            vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+            vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+            FLOAT cur_maxf = vx[0];
+            if(cur_maxf > maxf){
+                //tail index
+                v_max_index = VIDV_UINT(gvl);
+                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+            }
+        }
+    }
+    return(max_index+1);
+}
+
+
diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c
new file mode 100644
index 000000000..598cba387
--- /dev/null
+++ b/kernel/riscv64/imin.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/08/19 Saar
+* BLASTEST float
+* BLASTEST double
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+    BLASLONG min=0;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    minf=x[0];
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( x[ix] < minf )
+        {
+            min = i;
+            minf = x[ix];
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(min+1);
+}
+
+
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
new file mode 100644
index 000000000..e6e0e9f9f
--- /dev/null
+++ b/kernel/riscv64/imin_vector.c
@@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
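[Editor's note] All of the vector kernels in this patch share one stripmining shape: floor(n/gvl) full chunks at a constant gvl, then one vsetvli re-issue for the remainder. A compact scalar skeleton of that control flow, assuming (as the kernels do) that gvl stays fixed across the full chunks; body() is a placeholder, not a patch symbol:

#include <stddef.h>

/* body(start, len) stands in for one vector iteration at vector length len. */
static void stripmine(size_t n, size_t gvl,
                      void (*body)(size_t start, size_t len))
{
    size_t j = 0;
    for (size_t i = 0; i < n / gvl; i++) {  /* full chunks, constant gvl        */
        body(j, gvl);
        j += gvl;
    }
    if (j < n)                              /* kernel: gvl = vsetvli(n - j, ...) */
        body(j, n - j);
}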
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = 
VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c new file mode 100644 index 000000000..8fe33e95b --- /dev/null +++ b/kernel/riscv64/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c new file mode 100644 index 000000000..62c95d973 --- /dev/null +++ b/kernel/riscv64/izamax_vector.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
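[Editor's note] The CABS1 macro in izamax.c above encodes the BLAS convention that complex i?amax ranks elements by |Re|+|Im| rather than by the Euclidean modulus. The sketch below (cabs1_ref is an illustrative name) shows a case where the two measures disagree:

#include <math.h>
#include <stdio.h>

/* CABS1(x,i) = |Re| + |Im|: cheaper than a true modulus, and the measure
   the reference BLAS mandates for complex iamax/iamin. */
static double cabs1_ref(const double *x, long i)
{
    return fabs(x[i]) + fabs(x[i + 1]);
}

int main(void)
{
    double z[] = { 3.0, 4.0, 5.0, 0.0 };   /* interleaved: 3+4i, 5+0i */
    /* Euclidean moduli tie at 5, but the 1-norm ranks 3+4i first:   */
    printf("%g vs %g\n", cabs1_ref(z, 0), cabs1_ref(z, 2));  /* 7 vs 5 */
    return 0;
}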
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + 
"vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element greater than v_max + mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + maxf = vx0[0]; + mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_max = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + FLOAT cur_maxf = vx0[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c new file mode 100644 index 000000000..fb5a0d4cb --- /dev/null +++ b/kernel/riscv64/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c new file mode 100644 index 000000000..38eccf1b5 --- /dev/null +++ b/kernel/riscv64/izamin_vector.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
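[Editor's note] In the vectorized complex kernels (izamax_vector.c above, izamin_vector.c below), the two strided loads split the real and imaginary lanes, the masked vfrsub.vf stands in for fabs, and a vector add forms CABS1 per element. A scalar rendering of one stripmined chunk, with assumed helper names:

#include <stddef.h>

/* One chunk of the complex scheme: x holds interleaved (re,im) pairs,
   inc_x is counted in complex elements, out[k] receives CABS1. */
static void cabs1_chunk(const double *x, size_t inc_x, size_t gvl, double *out)
{
    for (size_t k = 0; k < gvl; k++) {
        double re = x[2 * inc_x * k];        /* VLSEV at &x[ix],   stride 2*inc_x */
        double im = x[2 * inc_x * k + 1];    /* VLSEV at &x[ix+1], same stride    */
        if (re < 0.0) re = -re;              /* masked vfrsub.vf == fabs          */
        if (im < 0.0) im = -im;
        out[k] = re + im;                    /* VFADDVV                           */
    }
}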
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, 
e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element less than v_min + mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + minf = vx0[0]; + mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_min = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + FLOAT cur_minf = vx0[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/riscv64/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c new file mode 100644 index 000000000..4ef75452d --- /dev/null +++ b/kernel/riscv64/max_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
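[Editor's note] Worth noting about the max.c kernel above: ?MAX takes no absolute value, so an all-negative vector yields its least-negative entry. A minimal check (reference names are illustrative):

#include <stdio.h>

static double max_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    double m = x[0];
    for (long i = 1, ix = inc_x; i < n; i++, ix += inc_x)
        if (x[ix] > m) m = x[ix];
    return m;
}

int main(void)
{
    double v[] = { -3.0, -7.0, -1.5 };
    printf("%g\n", max_ref(3, v, 1));  /* -1.5; AMAX would instead report 7 */
    return 0;
}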
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT maxf=-FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + BLASLONG idx = 0, inc_xv = inc_x * gvl; + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/riscv64/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
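[Editor's note] Parts of the unit-stride loop in max_vector.c above were lost to extraction (the text between the loop header and the final reduction is missing). Judging from the surviving guard if(gvl <= n/2) and the paired v0/v1 registers, the intent is a two-way unrolled maximum reduction; the following scalar sketch is a reconstruction under that assumption only, not the patch's code:

#include <float.h>
#include <stddef.h>

/* Reconstruction sketch: two running maxima per trip (v_max over v0 and v1
   in the kernel) to hide load latency, merged before the scalar tail. */
static double max_unrolled(size_t n, const double *x)
{
    if (n == 0) return 0.0;
    double m0 = -DBL_MAX, m1 = -DBL_MAX;  /* kernel seeds v_max with -FLT_MAX */
    size_t j = 0;
    for (; j + 2 <= n; j += 2) {          /* mirrors the i < n/(gvl*2) chunks */
        if (x[j]     > m0) m0 = x[j];
        if (x[j + 1] > m1) m1 = x[j + 1];
    }
    double m = m0 > m1 ? m0 : m1;         /* final VFREDMAXVS reduction       */
    for (; j < n; j++)                    /* tail chunk, vsetvli(n - j, ...)  */
        if (x[j] > m) m = x[j];
    return m;
}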
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c new file mode 100644 index 000000000..83c965bfa --- /dev/null +++ b/kernel/riscv64/min_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
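[Editor's note] One point that is easy to miss in these interfaces: inc_x strides through the buffer, so only every inc_x-th element participates. A short illustration (driver and names are illustrative):

#include <stdio.h>

static double min_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    double m = x[0];
    for (long i = 1, ix = inc_x; i < n; i++, ix += inc_x)
        if (x[ix] < m) m = x[ix];
    return m;
}

int main(void)
{
    double v[] = { 4.0, -9.0, 1.0, 2.0, 3.0, -1.0 };
    /* n=3, inc_x=2 visits v[0], v[2], v[4] only; -9.0 is never examined */
    printf("%g\n", min_ref(3, v, 2));  /* prints 1 */
    return 0;
}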
+*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c new file mode 100644 index 000000000..785c0d2f8 --- /dev/null +++ b/kernel/riscv64/nrm2_vector.c @@ -0,0 +1,220 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
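[Editor's note] The scalar nrm2.c above implements the classic overflow-safe 2-norm: it maintains the invariant scale^2 * ssq = (sum of squares seen so far), rescaling whenever a new largest |x_i| appears so no intermediate value overflows. A compact restatement with a case the naive formula cannot handle (names assumed):

#include <math.h>
#include <stdio.h>

/* Overflow-safe 2-norm: sum of squares kept as scale^2 * ssq, scale = max|x|. */
static double nrm2_ref(long n, const double *x)
{
    double scale = 0.0, ssq = 1.0;
    for (long i = 0; i < n; i++) {
        double a = fabs(x[i]);
        if (a == 0.0) continue;
        if (scale < a) {
            ssq = 1.0 + ssq * (scale / a) * (scale / a);  /* refit old sum */
            scale = a;
        } else {
            ssq += (a / scale) * (a / scale);
        }
    }
    return scale * sqrt(ssq);
}

int main(void)
{
    double big[] = { 3e200, 4e200 };   /* naive x[0]^2 + x[1]^2 overflows */
    printf("%g\n", nrm2_ref(2, big));  /* prints 5e+200 */
    return 0;
}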
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL16x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*4 
\n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL16x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL16x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL16x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + 
"addi t2, t2, 4*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t3, t3, 4*4 \n\t" + + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t" + + +#define KERNEL16x2_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + 
"vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t" + + +#define KERNEL16x2_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t" + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c new file mode 100644 index 000000000..9377bf4b9 --- /dev/null +++ b/kernel/riscv64/swap_vector.c @@ -0,0 +1,173 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += 
va[0]; + } + } + y[jy] += alpha * temp2; + jy += inc_y; + a_ptr += lda; + } + }else if(inc_y == 1){ + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + inc_xv = inc_x * gvl; + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + jx += inc_x; + a_ptr += lda; + } + }else{ + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U.c b/kernel/riscv64/symv_U.c new file mode 100644 index 000000000..b5a0c96e9 --- /dev/null +++ b/kernel/riscv64/symv_U.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if( m != offset ) + printf("Symv_U: m=%d offset=%d\n",m,offset); +#endif + + BLASLONG m1 = m - offset; + + jx = m1 * inc_x; + jy = m1 * inc_y; + + for (j=m1; j 0){ + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jy += inc_y; + } + }else if(inc_y == 1){ + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + 
if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + } + }else{ + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} + diff --git a/kernel/riscv64/zamax.c b/kernel/riscv64/zamax.c new file mode 100644 index 000000000..a39bd7821 --- /dev/null +++ b/kernel/riscv64/zamax.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c new file mode 100644 index 000000000..a6c742b14 --- /dev/null +++ b/kernel/riscv64/zamax_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max = VFMVVF_FLOAT(0, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i maxf) + maxf = v_max[0]; + } + return(maxf); +} diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c new file mode 100644 index 000000000..02eab3e75 --- /dev/null +++ b/kernel/riscv64/zamin.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c new file mode 100644 index 000000000..44a7cf1dc --- /dev/null +++ b/kernel/riscv64/zamin_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c new file mode 100644 index 000000000..d9fa88971 --- /dev/null +++ b/kernel/riscv64/zasum_vector.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < m){ + gvl = vsetvli(m-i, 
RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c new file mode 100644 index 000000000..6fe12c76c --- /dev/null +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLSEV_FLOAT vlsev_float32xm4 +#define VSSEV_FLOAT vssev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLSEV_FLOAT vlsev_float64xm4 +#define VSSEV_FLOAT vssev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = 
vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c new file mode 100644 index 000000000..fc1c8b54a --- /dev/null +++ b/kernel/riscv64/znrm2.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c new file mode 100644 index 000000000..b0ebfa5f4 --- /dev/null +++ b/kernel/riscv64/znrm2_vector.c @@ -0,0 +1,278 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); +// if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c new file mode 100644 index 000000000..b655a968c --- /dev/null +++ b/kernel/riscv64/zswap_vector.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG n2 = n * 2; + if(gvl <= n2/2){ + for(i=0,j=0; i #endif diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 3f79646e0..fbe531417 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -12,6 +12,8 @@ typedef __m256d v_f64; ***************************/ #define v_add_f32 _mm256_add_ps #define v_add_f64 _mm256_add_pd +#define v_sub_f32 _mm256_sub_ps +#define v_sub_f64 _mm256_sub_pd #define v_mul_f32 _mm256_mul_ps #define v_mul_f64 _mm256_mul_pd @@ -19,12 +21,20 @@ typedef __m256d v_f64; // multiply and add, a*b + c #define v_muladd_f32 _mm256_fmadd_ps #define v_muladd_f64 _mm256_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm256_fmsub_ps + #define v_mulsub_f64 _mm256_fmsub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // !HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. 
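
The hunk above (and the matching AVX-512/NEON/SSE hunks that follow) gives each backend both a true fused multiply-subtract (_mm256_fmsub_ps/_pd under HAVE_FMA3) and a plain multiply-then-subtract fallback. The two are not bit-identical: the FMSUB form rounds once, the fallback twice. A minimal scalar sketch of the difference — a hypothetical harness, not part of this patch; it assumes strict IEEE single-precision evaluation and no FP contraction (e.g. gcc -ffp-contract=off, link with -lm):

/* mulsub_fallback models the !HAVE_FMA3 branch of v_mulsub_f32,
 * mulsub_fma models the FMSUB instruction it stands in for. */
#include <math.h>
#include <stdio.h>

static float mulsub_fallback(float a, float b, float c)
{
    return (float)(a * b) - c;   /* product rounded first, then the difference */
}

static float mulsub_fma(float a, float b, float c)
{
    return fmaf(a, b, -c);       /* exact product, a single final rounding */
}

int main(void)
{
    /* a*b == 1 - 2^-46 exactly, which rounds to 1.0f as a float product */
    float a = 1.0f + 0x1p-23f, b = 1.0f - 0x1p-23f, c = 1.0f;
    printf("fallback: %a\n", mulsub_fallback(a, b, c)); /* prints 0x0p+0   */
    printf("fma:      %a\n", mulsub_fma(a, b, c));      /* prints -0x1p-46 */
    return 0;
}

Kernels built on v_mulsub therefore have to tolerate a last-ulp difference between FMA and non-FMA targets, exactly as they already do for the existing v_muladd.
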
diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index f00af53e9..8f38eedd9 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -12,11 +12,16 @@ typedef __m512d v_f64; ***************************/ #define v_add_f32 _mm512_add_ps #define v_add_f64 _mm512_add_pd +#define v_sub_f32 _mm512_sub_ps +#define v_sub_f64 _mm512_sub_pd #define v_mul_f32 _mm512_mul_ps #define v_mul_f64 _mm512_mul_pd // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps #define v_muladd_f64 _mm512_fmadd_pd +// multiply and subtract, a*b - c +#define v_mulsub_f32 _mm512_fmsub_ps +#define v_mulsub_f64 _mm512_fmsub_pd BLAS_FINLINE float v_sum_f32(v_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 22cef10ca..cd44599fe 100644 --- a/kernel/simd/intrin_neon.h +++ b/kernel/simd/intrin_neon.h @@ -18,6 +18,8 @@ typedef float32x4_t v_f32; ***************************/ #define v_add_f32 vaddq_f32 #define v_add_f64 vaddq_f64 +#define v_sub_f32 vsubq_f32 +#define v_sub_f64 vsubq_f64 #define v_mul_f32 vmulq_f32 #define v_mul_f64 vmulq_f64 @@ -26,16 +28,24 @@ typedef float32x4_t v_f32; // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vfmaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vfmaq_f32(vnegq_f32(c), a, b); } #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vmlaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vmlaq_f32(vnegq_f32(c), a, b); } #endif // FUSED F64 #if V_SIMD_F64 BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return vfmaq_f64(c, a, b); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return vfmaq_f64(vnegq_f64(c), a, b); } #endif // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 06a3fe78b..6a542072e 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -12,22 +12,35 @@ typedef __m128d v_f64; ***************************/ #define v_add_f32 _mm_add_ps #define v_add_f64 _mm_add_pd +#define v_sub_f32 _mm_sub_ps +#define v_sub_f64 _mm_sub_pd #define v_mul_f32 _mm_mul_ps #define v_mul_f64 _mm_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm_fmadd_ps #define v_muladd_f64 _mm_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_fmsub_ps + #define v_mulsub_f64 _mm_fmsub_pd #elif defined(HAVE_FMA4) // multiply and add, a*b + c #define v_muladd_f32 _mm_macc_ps #define v_muladd_f64 _mm_macc_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_msub_ps + #define v_mulsub_f64 _mm_msub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. 
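
With v_sub_* and v_mulsub_* now defined for all four backends (AVX, AVX-512, NEON, SSE), the universal-intrinsics layer covers both multiply-accumulate shapes that complex kernels need. As a usage sketch only — this helper is hypothetical and not part of the patch, though every macro it calls appears in the headers above:

/* One SIMD width of split-format complex multiply, c = a * b:
 *   re(c) = ar*br - ai*bi  ->  v_mulsub_f32 (a*b - c)
 *   im(c) = ar*bi + ai*br  ->  v_muladd_f32 (a*b + c)  */
BLAS_FINLINE void v_cmul_f32(v_f32 ar, v_f32 ai, v_f32 br, v_f32 bi,
                             v_f32 *cr, v_f32 *ci)
{
    *cr = v_mulsub_f32(ar, br, v_mul_f32(ai, bi));
    *ci = v_muladd_f32(ar, bi, v_mul_f32(ai, br));
}

On FMA3/FMA4/NEON targets each component costs one multiply plus one fused op; on the fallback paths the same expressions are evaluated with separate rounding steps and may differ only in the last ulp.
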
diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc index 2e8319ce5..1a2e9671a 100644 --- a/kernel/sparc/KERNEL.sparc +++ b/kernel/sparc/KERNEL.sparc @@ -54,3 +54,13 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + + +SDOTKERNEL = ../generic/dot.c +SDSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../generic/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 855e1ff8c..b92f480e9 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -384,6 +384,14 @@ endif GEMVDEP = ../l2param.h +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = sbgemv_t.c +endif + ifndef SGEMVNKERNEL SGEMVNKERNEL = sgemv_n.c endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index b979fc0ae..81eaf96ac 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 9b8b84c30..3d71584fe 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,3 +27,6 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c + +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h new file mode 100644 index 000000000..1014ecc4d --- /dev/null +++ b/kernel/x86_64/bf16_common_macros.h @@ -0,0 +1,795 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+#ifndef __BF16_COMMON_MACROS
+#define __BF16_COMMON_MACROS
+
+#include <immintrin.h>
+
+#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \
+    reg256##_0 = _mm512_castps512_ps256(reg512##_0); \
+    reg256##_1 = _mm512_castps512_ps256(reg512##_1);
+
+
+#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n) \
+    regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \
+    regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \
+    regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \
+    regArray = _mm512_loadu_si512(&a[idx_m*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask) \
+    regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask) \
+    regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 =
_mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); + +#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray = _mm512_maskz_loadu_epi16(mask, &a[idx_m*lda + idx_n]); + +#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n) \ + reg = _mm512_loadu_si512(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ + reg = _mm256_loadu_si256(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ + reg = _mm_loadu_si128(x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ + reg = _mm512_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask) \ + reg = _mm256_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask) \ + reg = _mm_maskz_loadu_epi16(mask, x + idx_n); + + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 
1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31 +*/ +#define BF16_INTERLEAVE_8x32(regArray) \ + regArray##_8 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 
|c10|c11|d10|d11 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15 +*/ +#define BF16_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm256_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm256_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm256_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm256_unpackhi_epi64(regArray##_14, regArray##_15); + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 +*/ +#define BF16_INTERLEAVE_4x32(regArray) \ + regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, 
regArray##_1); \ + regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 +*/ +#define BF16_INTERLEAVE_4x16(regArray) \ + regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for x with 32 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31 +*/ +#define BF16_INTERLEAVE_1x32(regArray) \ + regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, regArray##_3); + + +/* 2-step interleave for x with 16 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11 + 
|x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15 +*/ +#define BF16_INTERLEAVE_1x16(regArray) \ + regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3); + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 4 pair of registers + |a0|a1|...|a14|a15|i0|i1|...|i14|i15| + |b0|b1|...|b14|b15|j0|j1|...|j14|j15| + |c0|c1|...|c14|c15|k0|k1|...|k14|k15| + |d0|d1|...|d14|d15|l0|l1|...|l14|l15| + |e0|e1|...|e14|e15|m0|m1|...|m14|m15| + |f0|f1|...|f14|f15|n0|n1|...|n14|n15| + |g0|g1|...|g14|g15|o0|o1|...|o14|o15| + |h0|h1|...|h14|h15|p0|p1|...|p14|p15| +*/ +#define BF16_INTERLEAVE256_8x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0xee); \ + regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44); \ + regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee); \ + regArray##_6 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44); \ + regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee); + + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 2 pair of registers + |a0|a1|...|a14|a15|e0|e1|...|e14|e15| + |b0|b1|...|b14|b15|f0|f1|...|f14|f15| + |c0|c1|...|c14|c15|g0|g1|...|g14|g15| + |d0|d1|...|d14|d15|h0|h1|...|h14|h15| +*/ +#define BF16_INTERLEAVE256_4x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0xee); + + +#define BF16_PERMUTE_8x32(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7); + + +#define BF16_PERMUTE_8x32_2(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7); + + +#define BF16_PERMUTE_4x32(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi16(idx, 
regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3); + + +#define BF16_PERMUTE_4x32_2(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3); + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, 
(__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3); + + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray); \ + accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray); \ + accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray); \ + accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray); \ + accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray); \ + accumArray##_6 = _mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray); \ + accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_1x32(accumArray, matArray, xArray) \ + accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 16 elements per row + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray); \ + accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray); \ + accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray); \ + accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray); \ + accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray); \ + accumArray##_6 = _mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray); \ + accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray); + + +/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13| + |c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13| + |e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13| + |g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13| + |a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15| + |c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15| + |e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15| + |g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12| + |a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13| + |e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12| + |e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13| + 
|a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14| + |a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15| + |e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14| + |e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15| +*/ +#define FP32_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm512_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \ + regArray##_7 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15); + +#define FP32_INTERLEAVE_8x16_ARRAY(regArray) \ + regArray[8] = _mm512_unpacklo_ps(regArray[0], regArray[1]); \ + regArray[9] = _mm512_unpacklo_ps(regArray[2], regArray[3]); \ + regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]); \ + regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]); \ + regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]); \ + regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]); \ + regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]); \ + regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]); \ + \ + regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) regArray[15]); \ + regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]); + +/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5| + |c0|d0|c1|d1|c4|d4|c5|d5| + |e0|f0|e1|f1|e4|f4|e5|f5| + |g0|h0|g1|h1|g4|h4|g5|h5| + |a2|b2|a3|b3|a6|b6|a7|b7| + |c2|d2|c3|d3|c6|d6|c7|d7| + |e2|f2|e3|f3|e6|f6|e7|f7| + |g2|h2|g3|h3|g6|h6|g7|h7| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4| + |a1|b1|c1|d1|a5|b5|c5|d5| + |e0|f0|g0|h0|e4|f4|g4|h4| + |e1|f1|g1|h1|e5|f5|g5|h5| + |a2|b2|c2|d2|a6|b6|c6|d6| + 
|a3|b3|c3|d3|a7|b7|c7|d7| + |e2|f2|g2|h2|e6|f6|g6|h6| + |e3|f3|g3|h3|e7|f7|g7|h7| +*/ +#define FP32_INTERLEAVE_8x8(regArray) \ + regArray##_8 = _mm256_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \ + regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15); + + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x16(regArray) \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6); + +#define FP32_ACCUM2_8x16_ARRAY(regArray) \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[1]); \ + regArray[2] = _mm512_add_ps(regArray[2], regArray[3]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[5]); \ + regArray[6] = _mm512_add_ps(regArray[6], regArray[7]); \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[2]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[6]); + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x8(regArray) \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6); + + +/* Store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr))); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr))); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = 
_mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr))); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr))); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr))); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr))); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Masked store 16 (alpha * result) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Store 8 (alpha * result) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Masked store 8 (alpha * result) to y +*/ +#define 
STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
+    _mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
+
+
+/* Store 4 (alpha * result) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
+    _mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Masked store 4 (alpha * result) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
+    _mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Store 16 result to y
+*/
+#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+    _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 result to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+    _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 result to y
+*/
+#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+    _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 result to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+    _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 result to y
+*/
+#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+    _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 result to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+    _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+#endif
diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c
new file mode 100644
index 000000000..a1bd76f33
--- /dev/null
+++ b/kernel/x86_64/casum.c
@@ -0,0 +1,144 @@
+#include "common.h"
+
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
+
+#if defined(SKYLAKEX)
+#include "casum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_CASUM_KERNEL
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
+{
+    BLASLONG i = 0;
+    BLASLONG n_8 = n & -8;
+    FLOAT *x = x1;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+
+    while (i < n_8) {
+        temp0 = ABS_K(x[0]);
+        temp1 = ABS_K(x[1]);
+        temp2 = ABS_K(x[2]);
+        temp3 = ABS_K(x[3]);
+        temp4 = ABS_K(x[4]);
+        temp5 = ABS_K(x[5]);
+        temp6 = ABS_K(x[6]);
+        temp7 = ABS_K(x[7]);
+
+        sum0 += temp0;
+        sum1 += temp1;
+        sum2 += temp2;
+        sum3 += temp3;
+
+        sum0 += temp4;
+        sum1 += temp5;
+        sum2 += temp6;
+        sum3 += temp7;
+
+        x += 8;
+        i += 4;
+    }
+
+    /* scalar tail: continue from where the unrolled loop stopped */
+    while (i < n) {
+        sum4 += (ABS_K(x[0]) + ABS_K(x[1]));
+        x += 2;
+        i++;
+    }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ip = 0;
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    if (inc_x == 1) {
+        sumf = casum_kernel(n, x);
+    }
+    else {
+        inc_x2 = 2 * inc_x;
+
+        while (i < n) {
+            sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
+            ip += inc_x2;
+            i++;
+        }
+    }
+
+    return(sumf);
+}
+
+#if defined(SMP)
+static int asum_thread_function(BLASLONG n,
+                                BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+                                FLOAT *x, BLASLONG inc_x,
+                                FLOAT *dummy3, BLASLONG dummy4,
+                                FLOAT *result, BLASLONG dummy5)
+{
+    *(FLOAT *)result = asum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode,
+           BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
+           void *a, BLASLONG lda,
+           void *b, BLASLONG ldb,
+           void *c, BLASLONG ldc,
+           int (*function)(),
+           int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+
+    if (nthreads == 1) {
+        sumf = asum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) * 2];
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
+                                             NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
+        }
+    }
+#else
+    sumf = asum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
new file mode 100644
index 000000000..d51929f9f
--- /dev/null
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -0,0 +1,349 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_CASUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf = 0.0;
+    BLASLONG n2 = n + n;
+
+    if (n2 < 64) {
+        __m128 accum_10, accum_11, accum_12, accum_13;
+        __m128 abs_mask1;
+
+        accum_10 = _mm_setzero_ps();
+        accum_11 = _mm_setzero_ps();
+        accum_12 = _mm_setzero_ps();
+        accum_13 = _mm_setzero_ps();
+
+        /* all-ones compare with itself, then clear the sign bit: 0x7fffffff mask */
+        abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
+        abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
+
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+
+        if (n2 >= 32) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+
+            _mm_prefetch(&x1[16], _MM_HINT_T0);
+            __m128 x04 = _mm_loadu_ps(&x1[16]);
+            __m128 x05 = _mm_loadu_ps(&x1[20]);
+            __m128 x06 = _mm_loadu_ps(&x1[24]);
+            __m128 x07 = _mm_loadu_ps(&x1[28]);
+
+            x00 = _mm_and_ps(x00, abs_mask1);
+            x01 = _mm_and_ps(x01, abs_mask1);
+            x02 = _mm_and_ps(x02, abs_mask1);
+            x03 = _mm_and_ps(x03, abs_mask1);
+
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+
+            x04 = _mm_and_ps(x04, abs_mask1);
+            x05 = _mm_and_ps(x05, abs_mask1);
+            x06 = _mm_and_ps(x06, abs_mask1);
+            x07 = _mm_and_ps(x07, abs_mask1);
+
+            accum_10 = _mm_add_ps(accum_10, x04);
+            accum_11 = _mm_add_ps(accum_11, x05);
+            accum_12 = _mm_add_ps(accum_12, x06);
+            accum_13 = _mm_add_ps(accum_13, x07);
+
+            n2 -= 32;
+            x1 += 32;
+        }
+
+        if (n2 >= 16) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+
+            x00 = _mm_and_ps(x00, abs_mask1);
+            x01 = _mm_and_ps(x01, abs_mask1);
+            x02 = _mm_and_ps(x02, abs_mask1);
+            x03 = _mm_and_ps(x03, abs_mask1);
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+
+            n2 -= 16;
+            x1 += 16;
+        }
+
+        if (n2 >= 8) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            x00 = _mm_and_ps(x00, abs_mask1);
+            x01 = _mm_and_ps(x01,
abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + x00 = _mm_and_ps(x00, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + sumf += (ABS_K(x1[0]) + ABS_K(x1[1])); + } + + accum_10 = _mm_add_ps(accum_10, accum_11); + accum_12 = _mm_add_ps(accum_12, accum_13); + accum_10 = _mm_add_ps(accum_10, accum_12); + + accum_10 = _mm_hadd_ps(accum_10, accum_10); + accum_10 = _mm_hadd_ps(accum_10, accum_10); + + sumf += accum_10[0]; + } + else { + __m512 accum_0, accum_1, accum_2, accum_3; + __m512 x00, x01, x02, x03, x04, x05, x06, x07; + __m512 abs_mask = (__m512)_mm512_set1_epi32(0x7fffffff); + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 256) { + if (n2 >= 128) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[ 16]); + x02 = _mm512_loadu_ps(&x1[ 32]); + x03 = _mm512_loadu_ps(&x1[ 48]); + x04 = _mm512_loadu_ps(&x1[ 64]); + x05 = _mm512_loadu_ps(&x1[ 80]); + x06 = _mm512_loadu_ps(&x1[ 96]); + x07 = _mm512_loadu_ps(&x1[112]); + + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + n2 -= 128; + x1 += 128; + } + + if (n2 >= 64) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x02 = _mm512_loadu_ps(&x1[32]); + x03 = _mm512_loadu_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + + sumf = _mm512_reduce_add_ps(accum_0); + } + // n2 >= 256, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (0 != align_header) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header)); + x00 = 
_mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[ 16]); + x02 = _mm512_load_ps(&x1[ 32]); + x03 = _mm512_load_ps(&x1[ 48]); + x04 = _mm512_load_ps(&x1[ 64]); + x05 = _mm512_load_ps(&x1[ 80]); + x06 = _mm512_load_ps(&x1[ 96]); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + + while (n2 >= 128) { + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + x00 = _mm512_load_ps(&x1[ 0]); + accum_1 = _mm512_add_ps(accum_1, x01); + x01 = _mm512_load_ps(&x1[ 16]); + accum_2 = _mm512_add_ps(accum_2, x02); + x02 = _mm512_load_ps(&x1[ 32]); + accum_3 = _mm512_add_ps(accum_3, x03); + x03 = _mm512_load_ps(&x1[ 48]); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x04); + x04 = _mm512_load_ps(&x1[ 64]); + accum_1 = _mm512_add_ps(accum_1, x05); + x05 = _mm512_load_ps(&x1[ 80]); + accum_2 = _mm512_add_ps(accum_2, x06); + x06 = _mm512_load_ps(&x1[ 96]); + accum_3 = _mm512_add_ps(accum_3, x07); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + } + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + if (n2 >= 64) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x02 = _mm512_load_ps(&x1[32]); + x03 = _mm512_load_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + sumf = _mm512_reduce_add_ps(accum_0); + } + } + + return sumf; +} +#endif diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8a40ea4b9..ddec21383 
100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -58,21 +58,19 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) } #endif - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = dasum_kernel(n, x); - } + } else { n *= inc_x; - - while(i < n) { + while (i < n) { sumf += ABS_K(x[i]); i += inc_x; } @@ -80,3 +78,53 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) return(sumf); } +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; + FLOAT * dummy_b; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} + diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c new file mode 100644 index 000000000..66e9ff907 --- /dev/null +++ b/kernel/x86_64/drot.c @@ -0,0 +1,205 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "drot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "drot_microk_haswell-2.c" +#endif + +#ifndef HAVE_DROT_KERNEL +#include "../simd/intrin.h" + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; +#if V_SIMD_F64 && V_SIMD > 256 + const int vstep = v_nlanes_f64; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f64 __c = v_setall_f64(c); + v_f64 __s = v_setall_f64(s); + v_f64 vx0, vx1, vx2, vx3; + v_f64 vy0, vy1, vy2, vy3; + v_f64 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f64(x + i); + vx1 = v_loadu_f64(x + i + vstep); + vx2 = v_loadu_f64(x + i + vstep * 2); + vx3 = v_loadu_f64(x + i + vstep * 3); + vy0 = v_loadu_f64(y + i); + vy1 = v_loadu_f64(y + i + vstep); + vy2 = v_loadu_f64(y + i + vstep * 2); + vy3 = v_loadu_f64(y + i + vstep * 3); + + vt0 = v_mul_f64(__s, vy0); + vt1 = v_mul_f64(__s, vy1); + vt2 = v_mul_f64(__s, vy2); + vt3 = v_mul_f64(__s, vy3); + + vt0 = v_muladd_f64(__c, vx0, vt0); + vt1 = v_muladd_f64(__c, vx1, vt1); + vt2 = v_muladd_f64(__c, vx2, vt2); + vt3 = v_muladd_f64(__c, vx3, vt3); + + v_storeu_f64(x + i, vt0); + v_storeu_f64(x + i + vstep, vt1); + v_storeu_f64(x + i + vstep * 2, vt2); + 
v_storeu_f64(x + i + vstep * 3, vt3);
+
+        vt0 = v_mul_f64(__s, vx0);
+        vt1 = v_mul_f64(__s, vx1);
+        vt2 = v_mul_f64(__s, vx2);
+        vt3 = v_mul_f64(__s, vx3);
+
+        vt0 = v_mulsub_f64(__c, vy0, vt0);
+        vt1 = v_mulsub_f64(__c, vy1, vt1);
+        vt2 = v_mulsub_f64(__c, vy2, vt2);
+        vt3 = v_mulsub_f64(__c, vy3, vt3);
+
+        v_storeu_f64(y + i, vt0);
+        v_storeu_f64(y + i + vstep, vt1);
+        v_storeu_f64(y + i + vstep * 2, vt2);
+        v_storeu_f64(y + i + vstep * 3, vt3);
+    }
+
+    for (; i < unrollx; i += vstep) {
+        vx0 = v_loadu_f64(x + i);
+        vy0 = v_loadu_f64(y + i);
+
+        vt0 = v_mul_f64(__s, vy0);
+        vt0 = v_muladd_f64(__c, vx0, vt0);
+        v_storeu_f64(x + i, vt0);
+
+        vt0 = v_mul_f64(__s, vx0);
+        vt0 = v_mulsub_f64(__c, vy0, vt0);
+        v_storeu_f64(y + i, vt0);
+    }
+#else
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+
+    BLASLONG n1 = n & (~7);
+
+    while (i < n1) {
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+#endif
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0, iy = 0;
+
+    FLOAT temp;
+
+    if (n <= 0)
+        return;
+    if ((inc_x == 1) && (inc_y == 1)) {
+        drot_kernel(n, x, y, c, s);
+    }
+    else {
+        while (i < n) {
+            temp = c * x[ix] + s * y[iy];
+            y[iy] = c * y[iy] - s * x[ix];
+            x[ix] = temp;
+
+            ix += inc_x;
+            iy += inc_y;
+            i++;
+        }
+    }
+    return;
+}
+
+
+#if defined(SMP)
+static int rot_thread_function(blas_arg_t *args)
+{
+
+    rot_compute(args->m,
+                args->a, args->lda,
+                args->b, args->ldb,
+                ((FLOAT *)args->alpha)[0],
+                ((FLOAT *)args->alpha)[1]);
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT alpha[2] = {c, s};
+    FLOAT dummy_c;
+#endif
+
+#if defined(SMP)
+    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
+        nthreads = 1;
+    }
+    else {
+        nthreads = num_cpu_avail(1);
+    }
+
+    if (nthreads == 1) {
+        rot_compute(n, x, inc_x, y, inc_y, c, s);
+    }
+    else {
+#if defined(DOUBLE)
+        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+    }
+#else
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+#endif
+    return 0;
+}
diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c
new file mode 100644
index 000000000..72a87696e
--- /dev/null
+++ b/kernel/x86_64/drot_microk_haswell-2.c
@@ -0,0 +1,87 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void drot_kernel(BLASLONG n, FLOAT *x,
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+    BLASLONG tail_index_4 = n&(~3);
+    BLASLONG tail_index_16 = n&(~15);
+
+    __m256d c_256, s_256;
+    if (n >= 4) {
+        c_256 = _mm256_set1_pd(c);
+        s_256 = _mm256_set1_pd(s);
+    }
+
+    __m256d x0, x1, x2, x3;
+    __m256d y0, y1, y2, y3;
+    __m256d t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_16; i += 16) {
+        x0 = _mm256_loadu_pd(&x[i + 0]);
+        x1 = _mm256_loadu_pd(&x[i + 4]);
+        x2 = _mm256_loadu_pd(&x[i + 8]);
+        x3 = _mm256_loadu_pd(&x[i +12]);
+        y0 = _mm256_loadu_pd(&y[i + 0]);
+        y1 = _mm256_loadu_pd(&y[i + 4]);
+        y2 = _mm256_loadu_pd(&y[i + 8]);
+        y3 = _mm256_loadu_pd(&y[i +12]);
+
+        t0 = _mm256_mul_pd(s_256, y0);
+        t1 = _mm256_mul_pd(s_256, y1);
+        t2 = _mm256_mul_pd(s_256, y2);
+        t3 = _mm256_mul_pd(s_256, y3);
+
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        t1 = _mm256_fmadd_pd(c_256, x1, t1);
+        t2 = _mm256_fmadd_pd(c_256, x2, t2);
+        t3 = _mm256_fmadd_pd(c_256, x3, t3);
+
+        _mm256_storeu_pd(&x[i + 0], t0);
+        _mm256_storeu_pd(&x[i + 4], t1);
+        _mm256_storeu_pd(&x[i + 8], t2);
+        _mm256_storeu_pd(&x[i +12], t3);
+
+        t0 = _mm256_mul_pd(s_256, x0);
+        t1 = _mm256_mul_pd(s_256, x1);
+        t2 = _mm256_mul_pd(s_256, x2);
+        t3 = _mm256_mul_pd(s_256, x3);
+
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        t1 = _mm256_fmsub_pd(c_256, y1, t1);
+        t2 = _mm256_fmsub_pd(c_256, y2, t2);
+        t3 = _mm256_fmsub_pd(c_256, y3, t3);
+
+        _mm256_storeu_pd(&y[i + 0], t0);
+        _mm256_storeu_pd(&y[i + 4], t1);
+        _mm256_storeu_pd(&y[i + 8], t2);
+        _mm256_storeu_pd(&y[i +12], t3);
+
+    }
+
+    for (i = tail_index_16; i < tail_index_4; i += 4) {
+        x0 = _mm256_loadu_pd(&x[i]);
+        y0 = _mm256_loadu_pd(&y[i]);
+
+        t0 = _mm256_mul_pd(s_256, y0);
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        _mm256_storeu_pd(&x[i], t0);
+
+        t0 = _mm256_mul_pd(s_256, x0);
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        _mm256_storeu_pd(&y[i], t0);
+    }
+
+    for (i = tail_index_4; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c
new file mode 100644
index 000000000..4e862e663
--- /dev/null
+++ b/kernel/x86_64/drot_microk_skylakex-2.c
@@ -0,0 +1,94 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG n1 = n;
+
+    BLASLONG tail_index_8 = 0;
+    BLASLONG tail_index_32 = 0;
+
+    __m512d c_512 = _mm512_set1_pd(c);
+    __m512d s_512 = _mm512_set1_pd(s);
+
+    tail_index_8 = n1 & (~7);
+    tail_index_32 = n1 & (~31);
+
+
+    __m512d x0, x1, x2, x3;
+    __m512d y0, y1, y2, y3;
+    __m512d t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm512_loadu_pd(&x[i + 0]);
+        x1 = _mm512_loadu_pd(&x[i + 8]);
+        x2 = _mm512_loadu_pd(&x[i +16]);
+        x3 = _mm512_loadu_pd(&x[i +24]);
+        y0 = _mm512_loadu_pd(&y[i + 0]);
+        y1 = _mm512_loadu_pd(&y[i + 8]);
+        y2 = _mm512_loadu_pd(&y[i +16]);
+        y3 = _mm512_loadu_pd(&y[i +24]);
+
+        t0 = _mm512_mul_pd(s_512, y0);
+        t1 = _mm512_mul_pd(s_512, y1);
+        t2 = _mm512_mul_pd(s_512, y2);
+        t3 = _mm512_mul_pd(s_512, y3);
+
+        t0 = _mm512_fmadd_pd(c_512, x0, t0);
+        t1 = _mm512_fmadd_pd(c_512, x1, t1);
+        t2 = _mm512_fmadd_pd(c_512, x2, t2);
+        t3 = _mm512_fmadd_pd(c_512, x3, t3);
+
+        _mm512_storeu_pd(&x[i + 0], t0);
+        _mm512_storeu_pd(&x[i + 8], t1);
+        _mm512_storeu_pd(&x[i +16], t2);
+        _mm512_storeu_pd(&x[i 
+24], t3); + + t0 = _mm512_mul_pd(s_512, x0); + t1 = _mm512_mul_pd(s_512, x1); + t2 = _mm512_mul_pd(s_512, x2); + t3 = _mm512_mul_pd(s_512, x3); + + t0 = _mm512_fmsub_pd(c_512, y0, t0); + t1 = _mm512_fmsub_pd(c_512, y1, t1); + t2 = _mm512_fmsub_pd(c_512, y2, t2); + t3 = _mm512_fmsub_pd(c_512, y3, t3); + + _mm512_storeu_pd(&y[i + 0], t0); + _mm512_storeu_pd(&y[i + 8], t1); + _mm512_storeu_pd(&y[i +16], t2); + _mm512_storeu_pd(&y[i +24], t3); + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm512_loadu_pd(&x[i]); + y0 = _mm512_loadu_pd(&y[i]); + + t0 = _mm512_mul_pd(s_512, y0); + t0 = _mm512_fmadd_pd(c_512, x0, t0); + _mm512_storeu_pd(&x[i], t0); + + t0 = _mm512_mul_pd(s_512, x0); + t0 = _mm512_fmsub_pd(c_512, y0, t0); + _mm512_storeu_pd(&y[i], t0); + } + + if ((n1&7) > 0) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7))); + __m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]); + __m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]); + __m512d temp = _mm512_mul_pd(s_512, tail_y); + temp = _mm512_fmadd_pd(c_512, tail_x, temp); + _mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp); + temp = _mm512_mul_pd(s_512, tail_x); + temp = _mm512_fmsub_pd(c_512, tail_y, temp); + _mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp); + } +} +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 36ec4a737..d0cea9bee 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) #endif -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT * x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = sasum_kernel(n, x); } else { - n *= inc_x; while(i < n) { sumf += ABS_K(x[i]); i += inc_x; } - } + return (sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? 
num_cpu : n/100000; + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT * ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif return(sumf); } diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c new file mode 100644 index 000000000..18e64dc3f --- /dev/null +++ b/kernel/x86_64/sbgemv_n.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_n_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? 
(TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr
+
+#define ALIGN64_FREE(ptr) \
+    free(ptr)
+
+#ifndef HAVE_SBGEMV_N_ACCL_KERNEL
+static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    BLASLONG offset_lda, offset_m;
+    float accum = 0.0;
+    float tmp_x = 0.0;
+
+    bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float * a_fp32 = malloc(sizeof(float)*m*n);
+    float * x_fp32 = malloc(sizeof(float)*n);
+
+    for (BLASLONG j=0; j<n; j++) {
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
new file mode 100644
--- /dev/null
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_N_ACCL_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+
+// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios
+#undef ZERO_BETA
+#undef ONE_BETA
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios
+#undef ZERO_BETA
+#define ONE_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA as ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    if (beta == ZERO) {          // BETA == 0.0, no need to accumulate the original Y data
+        if (alpha == ONE) {      // ALPHA == 1.0, no need to multiply ALPHA
+            sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y);
+        } else {                 // ALPHA != 1.0, need to multiply ALPHA
+            sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+        }
+    } else {                     // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is
+        if (beta == ONE) {
+            sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+        } else {
+            sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+        }
+    }
+
+    return 0;
+}
+
+#endif
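The four repeated includes above are C's poor-man's template instantiation: each pass re-includes the same kernel body under a different ZERO_BETA/ONE_BETA/ONE_ALPHA configuration, and the template file selects the function name and the STORE*_RESULT macros from those switches. A toy, self-contained illustration of the same include-as-template pattern, assuming the snippet is saved as specialize.c (all names here are hypothetical, not OpenBLAS code):

/* specialize.c -- stamp out one function per macro configuration by
   re-including this very file; mirrors the ZERO_BETA trick above. */
#ifndef SPECIALIZE_DRIVER
#define SPECIALIZE_DRIVER

#include <stdio.h>

#define ZERO_BETA 1
#include "specialize.c"   /* pass 1: generates scale_zero_beta() */

#undef ZERO_BETA
#include "specialize.c"   /* pass 2: generates scale_general()   */

int main(void)
{
    float y = 2.0f;
    scale_zero_beta(&y, 3.0f, 0.0f);
    printf("%g\n", y);    /* 3:   y = alpha * 1           */
    scale_general(&y, 3.0f, 0.5f);
    printf("%g\n", y);    /* 4.5: y = alpha * 1 + beta*y  */
    return 0;
}

#else /* "template" half: compiled once per configuration of ZERO_BETA */

#ifdef ZERO_BETA
static void scale_zero_beta(float *y, float alpha, float beta)
{
    (void)beta;           /* beta known to be zero: y is never read */
    *y = alpha * 1.0f;
}
#else
static void scale_general(float *y, float alpha, float beta)
{
    *y = alpha * 1.0f + beta * *y;
}
#endif

#endif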
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
new file mode 100644
index 000000000..46e6d0ff9
--- /dev/null
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
@@ -0,0 +1,234 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+#ifndef ZERO_BETA // Beta is non-zero
+
+#ifndef ONE_BETA // BETA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#else // BETA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+
+#endif
+
+#else // BETA is zero
+
+#ifndef ONE_ALPHA // ALPHA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else // ALPHA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT
+
+#endif
+
+#endif
+
+
+
+// 32 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave)
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);
+    BLASLONG tag_m_128x = m & (~127);
+
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, 
accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + __m512i ZERO512 = _mm512_setzero_si512(); + + unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa); + __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value); + unsigned int blend_lo_mask_value = ((unsigned int)0x55555555); + __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value); + + __m512i M512_EPI32_8 = _mm512_set1_epi32(8); + __m512i idx_base_0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_8); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + matrixArray_2 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1); + matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1); + matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2); + matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2); + matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3); + matrixArray_7 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0) + BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0) + BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0) + BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0) + BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0) + BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3); + accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3); + accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5); + accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5); + accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7); + accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7); + + 
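+        // Lane i of accum512_0 holds the partial dot product for row idx_m+2*i (even rows, isolated by
+        // blend_lo_mask) and lane i of accum512_1 the one for row idx_m+2*i+1 (odd rows, blend_hi_mask);
+        // the same pairing holds for the other accumulator pairs. idx_base_0/idx_base_1 interleave each
+        // pair back into natural row order, 16 results per register, so the stores below can write
+        // y[idx_m..idx_m+127] contiguously.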
STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
+        STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32)
+        STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48)
+        STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64)
+        STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80)
+        STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96)
+        STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112)
+    }
+
+    for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_32x; idx_m+=32) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+
+        STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
+    }
+
+    if (tag_m_32x != m) {
+        unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
+        __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
+
+        unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15)));
+        __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value);
+
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+
+        if ((m-tag_m_32x) >= 16) {
+            STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0)
+            STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask)
+        } else {
+            STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask)
+        }
+    }
+
+    return 0;
+}
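The tail handling just above is the standard AVX-512 masked-remainder idiom: derive a k-mask with one bit per leftover element, then let maskz loads and masked stores keep the vector width fixed without touching memory past the end of the array. A self-contained sketch of the idiom (function name and values are illustrative only; build with -mavx512f):

#include <immintrin.h>
#include <stdio.h>

/* v[i] *= alpha for all n elements, 16 floats at a time, masked tail included */
static void scale_tail(float *v, long n, float alpha)
{
    long body = n & ~15L;                          /* full 16-wide chunks  */
    __m512 a = _mm512_set1_ps(alpha);
    for (long i = 0; i < body; i += 16)
        _mm512_storeu_ps(v + i, _mm512_mul_ps(a, _mm512_loadu_ps(v + i)));
    if (n & 15) {                                  /* 1..15 leftover lanes */
        __mmask16 tail = (__mmask16)((1u << (n & 15)) - 1);
        __m512 x = _mm512_maskz_loadu_ps(tail, v + body);
        _mm512_mask_storeu_ps(v + body, tail, _mm512_mul_ps(a, x));
    }
}

int main(void)
{
    float v[21];
    for (int i = 0; i < 21; i++) v[i] = 1.0f;
    scale_tail(v, 21, 2.0f);
    printf("%g %g\n", v[15], v[20]);               /* prints 2 2 */
    return 0;
}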
diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c
new file mode 100644
index 000000000..22b099116
--- /dev/null
+++ b/kernel/x86_64/sbgemv_t.c
@@ -0,0 +1,142 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined (COOPERLAKE)
+#include "sbgemv_t_microk_cooperlake.c"
+#endif
+
+#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \
+    ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \
+    ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr
+
+#define ALIGN64_FREE(ptr) \
+    free(ptr)
+
+#ifndef HAVE_SBGEMV_T_ACCL_KERNEL
+static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    BLASLONG offset_lda, offset_n;
+    float accum = 0.0;
+
+    bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float * a_fp32 = malloc(sizeof(float)*m*n);
+    float * x_fp32 = malloc(sizeof(float)*n);
+
+    for (BLASLONG i=0; i<m; i++) {
diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
new file mode 100644
--- /dev/null
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_T_ACCL_KERNEL 1
+
+// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios
+#undef ZERO_BETA
+#undef ONE_BETA
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios
+#undef ZERO_BETA
+#define ONE_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA as ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_t_microk_cooperlake_template.c"
+
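Before the dispatch function that follows, it is worth pinning down the semantics all of its variants share. A scalar reference of the transposed bf16 GEMV these kernels accelerate (a sketch, not the library's code; the helper simply widens the 16-bit pattern into the high half of an fp32, which is exactly what bfloat16 is):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t bf16;                        /* raw bfloat16 bit pattern */

static float bf16_to_fp32(bf16 v)             /* bf16 == high 16 bits of fp32 */
{
    union { uint32_t u; float f; } t;
    t.u = (uint32_t)v << 16;
    return t.f;
}

/* y[i] = alpha * sum_j a[i*lda + j] * x[j]  (+ beta * y[i] when beta != 0) */
static void sbgemv_t_ref(long m, long n, float alpha, const bf16 *a, long lda,
                         const bf16 *x, float beta, float *y)
{
    for (long i = 0; i < m; i++) {
        float acc = 0.0f;
        for (long j = 0; j < n; j++)
            acc += bf16_to_fp32(a[i * lda + j]) * bf16_to_fp32(x[j]);
        y[i] = (beta == 0.0f) ? alpha * acc : alpha * acc + beta * y[i];
    }
}

int main(void)
{
    bf16 one = 0x3f80;                        /* 1.0f in bfloat16 */
    bf16 a[4] = {one, one, one, one}, x[2] = {one, one};
    float y[2] = {0, 0};
    sbgemv_t_ref(2, 2, 1.0f, a, 2, x, 0.0f, y);
    printf("%g %g\n", y[0], y[1]);            /* prints 2 2 */
    return 0;
}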
+static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    if (beta == ZERO) {          // BETA == 0.0, no need to accumulate the original Y data
+        if (alpha == ONE) {      // ALPHA == 1.0, no need to multiply ALPHA
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1 (m, alpha, a, x, y);  break;
+                            case 2:  sbgemv_kernel_32x2 (m, alpha, a, x, y);  break;
+                            case 3:  sbgemv_kernel_32x3 (m, alpha, a, x, y);  break;
+                            case 4:  sbgemv_kernel_16x4 (m, alpha, a, x, y);  break;
+                            case 5:  sbgemv_kernel_30x5 (m, alpha, a, x, y);  break;
+                            case 6:  sbgemv_kernel_16x6 (m, alpha, a, x, y);  break;
+                            case 7:  sbgemv_kernel_16x7 (m, alpha, a, x, y);  break;
+                            case 8:  sbgemv_kernel_16x8 (m, alpha, a, x, y);  break;
+                            case 9:  sbgemv_kernel_14x9 (m, alpha, a, x, y);  break;
+                            case 10: sbgemv_kernel_12x10(m, alpha, a, x, y);  break;
+                            case 11: sbgemv_kernel_15x11(m, alpha, a, x, y);  break;
+                            case 12: sbgemv_kernel_15x12(m, alpha, a, x, y);  break;
+                            case 13: sbgemv_kernel_16x13(m, alpha, a, x, y);  break;
+                            case 14: sbgemv_kernel_16x14(m, alpha, a, x, y);  break;
+                            case 15: sbgemv_kernel_16x15(m, alpha, a, x, y);  break;
+                            case 16: sbgemv_kernel_16x16(m, alpha, a, x, y);  break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y);
+                    }
+                }
+            }
+        } else {                 // ALPHA != 1.0, need to multiply ALPHA
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1_alpha (m, alpha, a, x, y);  break;
+                            case 2:  sbgemv_kernel_32x2_alpha (m, alpha, a, x, y);  break;
+                            case 3:  sbgemv_kernel_32x3_alpha (m, alpha, a, x, y);  break;
+                            case 4:  sbgemv_kernel_16x4_alpha (m, alpha, a, x, y);  break;
+                            case 5:  sbgemv_kernel_30x5_alpha (m, alpha, a, x, y);  break;
+                            case 6:  sbgemv_kernel_16x6_alpha (m, alpha, a, x, y);  break;
+                            case 7:  sbgemv_kernel_16x7_alpha (m, alpha, a, x, y);  break;
+                            case 8:  sbgemv_kernel_16x8_alpha (m, alpha, a, x, y);  break;
+                            case 9:  sbgemv_kernel_14x9_alpha (m, alpha, a, x, y);  break;
+                            case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y);  break;
+                            case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y);  break;
+                            case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y);  break;
+                            case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y);  break;
+                            case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y);  break;
+                            case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y);  break;
+                            case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y);  break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y);
+                    }
+                }
+            }
+        }
+    } else {                     // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is
+        if (beta == ONE) {
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 2:  sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 3:  sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 4:  sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 5:  sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 6:  sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 7:  sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 8:  sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 9:  sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 11: sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 16: 
sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } + } + } + } else { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break; + case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, y); break; + case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + } + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c new file mode 100644 index 000000000..51e681add --- /dev/null +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -0,0 +1,3082 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+#ifndef ZERO_BETA // Beta is non-zero
+
+#ifndef ONE_BETA // BETA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#else // BETA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+
+#endif
+
+#else // BETA is zero
+
+#ifndef ONE_ALPHA // ALPHA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else // ALPHA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT
+
+#endif
+
+#endif
+
+
+// 32 rows parallel processing BF16 GEMV kernel for n=1 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x1_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x1_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x1_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x1(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+    __m512i xArray;
+    __m512 result_0, result_1;
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+#endif
+
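+    // dpbf16 multiplies bf16 lanes pairwise and accumulates each pair into one fp32 lane. With n=1 a
+    // row contributes a single element, so the permutes below (load_idx_lo/load_idx_hi) spread the 32
+    // loaded row elements into the even bf16 lanes, while xArray is blended so that x[0] sits in the
+    // even lanes and zero in the odd lanes; each fp32 accumulator lane then receives exactly
+    // row_i * x[0], and whatever lands in the odd matrix lanes is a don't-care because its x
+    // counterpart is zero.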
__m512i load_idx_lo = _mm512_set_epi16(0, 15, 0, 14, 0, 13, 0, 12, 0, 11, 0, 10, 0, 9, 0, 8,\ + 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0); + __m512i M512_EPI16_16 = _mm512_set1_epi16(16); + __m512i load_idx_hi = _mm512_add_epi16(load_idx_lo, M512_EPI16_16); + + unsigned int interleve_mask_value = ((unsigned int) 0x55555555); + __mmask32 interleave_mask = *((__mmask32*) &interleve_mask_value); + + xArray = _mm512_set1_epi16((short) x[0]); + xArray = _mm512_mask_blend_epi16(interleave_mask, _mm512_setzero_si512(), xArray); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-tail_num)); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> (32-tail_num)); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + STORE16_MASK_COMPLETE_RESULT(result_1, y+tag_m_32x+16, store_mask) + } else if (tail_num > 8) { + __m256 result256_0 = _mm256_setzero_ps(); + __m256 result256_1 = _mm256_setzero_ps(); + + __m256i load_idx_lo256 = _mm512_castsi512_si256(load_idx_lo); + __m256i load_idx_hi256 = _mm512_extracti32x8_epi32(load_idx_lo, 0x1); + __m256i xArray256 = _mm512_castsi512_si256(xArray); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + __m256i matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 16 rows with n=1 + __m256i matrixArray256_1 = _mm256_permutexvar_epi16(load_idx_lo256, matrixArray256_0); // Expand the low 8 elements + __m256i matrixArray256_2 = _mm256_permutexvar_epi16(load_idx_hi256, matrixArray256_0); // Expand the high 8 elements + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_1, (__m256bh) xArray256); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_2, (__m256bh) xArray256); + + unsigned char store_mask_value = (((unsigned char)0xff) >> (16-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_MASK_COMPLETE_RESULT(result256_1, y+tag_m_32x+8, store_mask) + } else { + __m128 result128_0 = _mm_setzero_ps(); 
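+            // Tail of at most 8 rows: 0xff >> (8 - tail_num) keeps one low bit per remaining row, so
+            // the maskz load below reads only valid elements and zero-fills the rest; the 4-wide store
+            // masks at the end of this branch are built with the same shift pattern.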
+ __m128 result128_1 = _mm_setzero_ps(); + + __m128i load_idx_lo128 = _mm_set_epi16(0, 3, 0, 2, 0, 1, 0, 0); + __m128i M128_EPI16_4 = _mm_set1_epi16(4); + __m128i load_idx_hi128 = _mm_add_epi16(load_idx_lo128, M128_EPI16_4); + + __m128i xArray128 = _mm512_castsi512_si128(xArray); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m128i matrixArray128_0 = _mm_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 8 rows with n=1 + __m128i matrixArray128_1 = _mm_permutexvar_epi16(load_idx_lo128, matrixArray128_0); // Expand the low 4 elements + __m128i matrixArray128_2 = _mm_permutexvar_epi16(load_idx_hi128, matrixArray128_0); // Expand the high 4 elements + + result128_0 = _mm_dpbf16_ps(result128_0, (__m128bh) matrixArray128_1, (__m128bh) xArray128); + result128_1 = _mm_dpbf16_ps(result128_1, (__m128bh) matrixArray128_2, (__m128bh) xArray128); + + if (tail_num > 4) { + unsigned char store_mask_value = (((unsigned char)0xf) >> (8-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_COMPLETE_RESULT(result128_0, y+tag_m_32x) + STORE4_MASK_COMPLETE_RESULT(result128_1, y+tag_m_32x+4, store_mask) + } else { + unsigned char store_mask_value = (((unsigned char)0xf) >> (4-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_MASK_COMPLETE_RESULT(result128_0, y+tag_m_32x, store_mask) + } + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=2 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x2_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x2_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x2_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1; + __m512i xArray; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + xArray = _mm512_broadcastd_epi32(_mm_maskz_loadu_epi16(load_mask, x)); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*2]); // Load 16 rows as n=2 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+16)*2]); // Load 16 rows as n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (m - tag_m_32x >= 16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + + tag_m_32x += 16; + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 8) { + result_0 = _mm512_setzero_ps(); + + unsigned short 
tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_32x, tail_mask) + } else if (tail_num == 8) { + __m256 result256 = _mm256_setzero_ps(); + + __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_32x) + } else { + __m256 result256 = _mm256_setzero_ps(); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m256i matrixArray256 = _mm256_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_32x, tail_mask) + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=3 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x3_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x3_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x3_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|0|0|0|0|0| + __m512i xArray_0 = _mm512_broadcastd_epi32(xTmp); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_1 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // x2| 0|x2| 0|...|x2| 0| + + __m512i load_idx_base; + __m512i M512_EPI16_2, M512_EPI16_8, M512_EPI16_16; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + M512_EPI16_16 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + load_idx_base = _mm512_set_epi16(46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 0); + + if (tag_m_32x > 0) { + __m512i load_idx01_1st, load_idx01_2nd, load_idx2_1st, load_idx2_2nd; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6; + + unsigned int idx_blend_mask_value = ((unsigned int)0x80000000); + __mmask32 idx_blend_mask = *((__mmask32*) &idx_blend_mask_value); + + load_idx01_1st = load_idx_base; + load_idx01_2nd = _mm512_add_epi16(load_idx01_1st, M512_EPI16_16); + load_idx2_1st = _mm512_add_epi16(load_idx01_1st, M512_EPI16_2); + load_idx2_2nd = _mm512_add_epi16(load_idx01_2nd, M512_EPI16_2); + load_idx2_2nd = 
_mm512_mask_blend_epi16(idx_blend_mask, load_idx2_2nd, _mm512_setzero_si512()); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*3]); // Load 10 rows with n=3 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+10)*3 + 2)]); // Load 10 rows with n=3 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+21)*3 + 1)]); // Load 10 rows with n=3 plus 2 element + + matrixArray_3 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_1st, matrixArray_1); // Select the first 2 elements for each row + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_1, load_idx01_2nd, matrixArray_2); // Select the first 2 elements for each row + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_0, load_idx2_1st, matrixArray_1); // Select the third element for each row + matrixArray_6 = _mm512_permutex2var_epi16(matrixArray_1, load_idx2_2nd, matrixArray_2); // Select the third element for each row + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_3, (__m512bh) xArray_0); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_5, (__m512bh) xArray_1); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_4, (__m512bh) xArray_0); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_6, (__m512bh) xArray_1); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (tag_m_32x != m) { + __m256i load256_idx01_1st, load256_idx01_2nd, load256_idx2_1st, load256_idx2_2nd; + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6; + __m256 result256_0, result256_1; + + unsigned short idx256_blend_mask_value = ((unsigned short)0x8000); + __mmask16 idx256_blend_mask = *((__mmask16*) &idx256_blend_mask_value); + + load256_idx01_1st = _mm512_castsi512_si256(load_idx_base); + load256_idx01_2nd = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_8)); + load256_idx2_1st = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_add_epi16(load256_idx01_2nd, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_mask_blend_epi16(idx256_blend_mask, load256_idx2_2nd, _mm256_setzero_si256()); + + if (m - tag_m_32x > 15) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, 
(__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_COMPLETE_RESULT(result256_1, y+tag_m_32x+8) + + tag_m_32x += 16; + } + + if (tag_m_32x != m) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + BLASLONG tail_num = m-tag_m_32x; + + if (tail_num > 10) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else if (tail_num > 5) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows + matrixArray256_2 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = 
_mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num*3))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)*3]); // Load m-tag_m_32x rows + matrixArray256_1 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } + + unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num))); + __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value); + __m512 result512 = _mm512_insertf32x8(_mm512_castps256_ps512(result256_0), result256_1, 0x1); + STORE16_MASK_COMPLETE_RESULT(result512, y+tag_m_32x, store_tail_mask) + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=4 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x4_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x4_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x4_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_01, xArray_23, xArray_remix; + __m512 result; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + __m512i idx_base_remix = _mm512_inserti32x8(idx_base_0, _mm512_castsi512_si256(idx_base_1), 0x1); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi32(x_load_mask, x); // |x0|x1|x2|x3|0|0|0|0| + xArray_01 = _mm512_broadcastd_epi32(xTmp); // |x0|x1|x0|x1|...|x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // |x2|x3|x2|x3|...|x2|x3| + unsigned short blend_mask_value = ((unsigned short)0xff00); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + xArray_remix = _mm512_mask_blend_epi32(blend_mask, xArray_01, xArray_23); // |x0|x1|x0|x1|x0|x1|x0|x1|...|x2|x3|x2|x3|x2|x3|x2|x3| + + if (tag_m_16x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*4]); // Load 8 rows with n=4 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+8)*4]); // Load 8 rows with n=4 + + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_0, matrixArray_1); // 
|a0|a1|...|h0|h1|i0|i1|...|p0|p1|
+            matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_1, matrixArray_1); // |a2|a3|...|h2|h3|i2|i3|...|p2|p3|
+
+            result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_01);
+            result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_3, (__m512bh) xArray_23);
+
+            STORE16_COMPLETE_RESULT(result, y+idx_m)
+        }
+    }
+
+    if (m - tag_m_16x > 7) {
+        result = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*4]); // Load 8 rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));
+
+        STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+        tag_m_16x += 8;
+    }
+
+    BLASLONG tail_num = m-tag_m_16x;
+    if (tail_num != 0) {
+        result = _mm512_setzero_ps();
+
+        unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num*2));
+        __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+        matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_16x)*4]); // Load the remaining tail_num rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));
+
+        unsigned char store_tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));
+        __mmask8 store_tail_mask = *((__mmask8*) &store_tail_mask_value);
+        STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, store_tail_mask)
+    }
+
+    return 0;
+}
+
+// 30 rows parallel processing BF16 GEMV kernel for n=5 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_30x5_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_30x5_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_30x5_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_30x = m - (m%30);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 3);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|0|0|0|
+
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+    __m512 result_0, result_1;
+    __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1|
+    __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3|
+    __m512i xArray_4 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4| 0|x4| 0|...|x4| 0|
+
+    __m512i M512_EPI16_2 = _mm512_set1_epi16(2);
+    __m512i load_idx01_stage1_1st = _mm512_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0, 58, 57, 53, 52, 48, 47, 43, 42,
+                                                     38, 37, 33, 32, 26, 25, 21, 20, 16, 15, 11, 10, 6, 5, 1, 0);
+    __m512i load_idx01_stage1_2nd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x39);
+    __m512i load_idx01_stage1_3rd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x4f);
+
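+    // NOTE: load_idx01_stage1_* gather the 16-bit lanes holding elements 0|1 of each
+    // 5-element row out of two source registers; adding 2 below shifts the same
+    // pattern to elements 2|3, and adding 2 again selects element 4, so each
+    // _mm512_dpbf16_ps step can consume one element pair per row.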
+    __m512i load_idx23_stage1_1st = _mm512_add_epi16(load_idx01_stage1_1st, M512_EPI16_2);
+    __m512i load_idx23_stage1_2nd = _mm512_add_epi16(load_idx01_stage1_2nd, M512_EPI16_2);
+    __m512i load_idx23_stage1_3rd = _mm512_add_epi16(load_idx01_stage1_3rd, M512_EPI16_2);
+
+    __m512i load_idx4_stage1_1st = _mm512_add_epi16(load_idx23_stage1_1st, M512_EPI16_2);
+    __m512i load_idx4_stage1_2nd = _mm512_add_epi16(load_idx23_stage1_2nd, M512_EPI16_2);
+    __m512i load_idx4_stage1_3rd = _mm512_add_epi16(load_idx23_stage1_3rd, M512_EPI16_2);
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4;
+    __m512i matrixArray_stage1_0, matrixArray_stage1_1, matrixArray_stage1_2;
+    __m512i matrixArray_stage2_0, matrixArray_stage2_1;
+
+    unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);
+    __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+    unsigned short store_mask_value = (((unsigned short)0xffff) >> 2);
+    __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+    if (tag_m_30x > 0) {
+        unsigned short blend_mask_value_0 = ((unsigned short)0xf000);
+        __mmask16 blend_mask_0 = *((__mmask16*) &blend_mask_value_0);
+        unsigned short blend_mask_value_1 = ((unsigned short)0x3f00);
+        __mmask16 blend_mask_1 = *((__mmask16*) &blend_mask_value_1);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_30x; idx_m+=30) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5
+            matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+12)*5)]); // Load 6 rows with n=5
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+18)*5)]); // Load 6 rows with n=5
+            matrixArray_4 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+24)*5)]); // Load 6 rows with n=5
+
+            // Process the 0|1 elements
+            // Stage 1: Select the 0|1 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx01_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx01_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 0|1 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 0|1 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_01);
+
+            // Process the 2|3 elements
+            // Stage 1: Select the 2|3 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx23_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx23_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 2|3 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 2|3 elements and accumulate the result of 0|1 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_23);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_23);
+
+            // Process the 4 elements
+            // Stage 1: Select the 4 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx4_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx4_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 4 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 4 elements and accumulate the result of 0|1 and 2|3 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_4);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_4);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+            STORE16_MASK_COMPLETE_RESULT(result_1, y+idx_m+16, store_mask)
+        }
+    }
+
+    if (m - tag_m_30x > 11) {
+        BLASLONG tag_m_12x = m - ((m-tag_m_30x)%12);
+        for (BLASLONG idx_m = tag_m_30x; idx_m < tag_m_12x; idx_m+=12) {
+            unsigned short store_less_mask_value = (((unsigned short)0xffff) >> 4);
+            __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5
+
+            // Interleave the elements
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+            matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+            // Calculate and accumulate the result
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_less_mask)
+            tag_m_30x += 12;
+        }
+    }
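+    // NOTE: rows are consumed in blocks of 30 above, then in blocks of 12 with a
+    // masked store; the final tail below is handled either with masked 512-bit
+    // loads (more than 6 rows left) or row by row in a scalar loop.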
+
+    BLASLONG tail_num = m - tag_m_30x;
+    if (tail_num > 6) {
+        unsigned short store_less_mask_value = (((unsigned short)0xffff) >> (4+(12-tail_num)));
+        __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+        unsigned int load_less_mask_value = (((unsigned int)0xffffffff) >> (2+(12-tail_num)*5));
+        __mmask32 load_less_mask = *((__mmask32*) &load_less_mask_value);
+        result_0 = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_30x)*5]); // Load 6 rows with n=5
+        matrixArray_1 = _mm512_maskz_loadu_epi16(load_less_mask, &a[((tag_m_30x+6)*5)]); // Load the remaining tail_num-6 rows with n=5
+
+        // Interleave the elements
+        matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+        matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+        matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+        // Calculate and accumulate the result
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+        STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_30x, store_less_mask)
+    } else {
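+        // NOTE: the scalar fallback computes one row per iteration: a masked
+        // 128-bit load feeds _mm_dpbf16_ps, leaving four float partial sums;
+        // _mm_shuffle_ps with immediate 14 (0b1110) folds the upper pair onto the
+        // lower pair, and a second shuffle with immediate 1 brings the remaining
+        // partial sum into lane 0 before the alpha/beta epilogue.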
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_30x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*5]); // Load 1 row with n=5
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=6 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x6_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x6_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x6_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 2);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|x5|0|0|
+
+    if (tag_m_16x > 0) {
+        __m512 result_0;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_1 = _mm512_set1_epi32(1);
+        __m512i load_idx01_1st = _mm512_set_epi32( 0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
+        __m512i load_idx01_2nd = _mm512_set_epi32(13, 10, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+        __m512i load_idx23_1st = _mm512_add_epi32(load_idx01_1st, M512_EPI32_1);
+        __m512i load_idx23_2nd = _mm512_add_epi32(load_idx01_2nd, M512_EPI32_1);
+
+        __m512i load_idx45_1st = _mm512_add_epi32(load_idx23_1st, M512_EPI32_1);
+        __m512i load_idx45_2nd = _mm512_add_epi32(load_idx23_2nd, M512_EPI32_1);
+
+        unsigned short blend_mask_value = ((unsigned short)0x0400);
+        __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+        // Set the 11th element to be 0 as invalid index for a 512 bit epi32 register
+        load_idx45_1st = _mm512_mask_blend_epi32(blend_mask, load_idx45_1st, load_idx01_2nd);
+        // Set the 11th element to be 0 as 0 is the correct index
+        load_idx45_2nd = _mm512_mask_blend_epi32(blend_mask, load_idx45_2nd, load_idx01_2nd);
+
+        __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1|
+        __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3|
+        __m512i xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4|x5|x4|x5|...|x4|x5|
+
+        unsigned short permute_mask01_uint = (((unsigned short)0xf800));
+        __mmask16 permute_mask01 = *((__mmask16*) &permute_mask01_uint);
+        unsigned short permute_mask45_uint = (((unsigned short)0xfc00));
+        __mmask16 permute_mask45 = *((__mmask16*) &permute_mask45_uint);
+
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2;
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*6]); // Load 5 rows with n=6 plus 2 elements
+            matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+5)*6 + 2)]); // Load 5 rows with n=6 plus 2 elements
+            matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+10)*6 + 4)]); // Load 5 rows with n=6 plus 2 elements
+
+            // Stage 1: interleave for the a..k elements
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1);
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1);
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1);
+
+            // Stage 2: interleave for the l..p elements and remix together
+            matrixArray_stage_0 = _mm512_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2);
+            matrixArray_stage_1 = _mm512_mask_permutexvar_epi32(matrixArray_stage_1, permute_mask01, load_idx23_2nd, matrixArray_2);
+            matrixArray_stage_2 = _mm512_mask_permutexvar_epi32(matrixArray_stage_2, permute_mask45, load_idx45_2nd, matrixArray_2);
+
+            // Accumulate the results of the 0|1, 2|3 and 4|5 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_01);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_45);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        }
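+        // NOTE: the 8-row tail below repeats the same gather/blend scheme with
+        // 256-bit registers: three ymm loads cover 8 rows x 6 elements = 48 bf16,
+        // and the same invalid-index blend trick keeps the permute tables in range.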
+
+        if (m - tag_m_16x > 7) {
+            __m256i M256_EPI32_1 = _mm512_castsi512_si256(M512_EPI32_1);
+            __m256i load_idx01_1st = _mm256_set_epi32( 0, 0, 15, 12, 9, 6, 3, 0);
+            __m256i load_idx01_2nd = _mm256_set_epi32( 5, 2, 0, 0, 0, 0, 0, 0);
+
+            __m256i load_idx23_1st = _mm256_add_epi32(load_idx01_1st, M256_EPI32_1);
+            __m256i load_idx23_2nd = _mm256_add_epi32(load_idx01_2nd, M256_EPI32_1);
+            unsigned char blend_mask_value = ((unsigned char)0x20);
+            __mmask8 blend_mask = *((__mmask8*) &blend_mask_value);
+            // Set the 6th element to be 0 as invalid index for a 512 bit epi32 register
+            load_idx23_1st = _mm256_mask_blend_epi32(blend_mask, load_idx23_1st, load_idx01_2nd);
+            // Set the 6th element to be 0 as 0 is the correct index
+            load_idx23_2nd = _mm256_mask_blend_epi32(blend_mask, load_idx23_2nd, load_idx01_2nd);
+
+            __m256i load_idx45_1st = _mm256_add_epi32(load_idx23_1st, M256_EPI32_1);
+            __m256i load_idx45_2nd = _mm256_add_epi32(load_idx23_2nd, M256_EPI32_1);
+
+            unsigned char permute_mask01_uint = (((unsigned char)0xc0));
+            __mmask8 permute_mask01 = *((__mmask8*) &permute_mask01_uint);
+            unsigned char permute_mask45_uint = (((unsigned char)0xe0));
+            __mmask8 permute_mask45 = *((__mmask8*) &permute_mask45_uint);
+
+            __m256i matrixArray_0, matrixArray_1, matrixArray_2;
+            __m256i matrixArray_stage_0;
+            __m256 result256_0;
+
+            result256_0 = _mm256_setzero_ps();
+
+            matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 elements
+            matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 elements
+            matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 elements
+
+            // Process the 0|1 elements
+            // Select the 0|1 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2);
+            // Calculate the result of the 0|1 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_01));
+
+            // Process the 2|3 elements
+            // Select the 2|3 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx23_2nd, matrixArray_2);
+            // Calculate the result of the 2|3 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_23));
+
+            // Process the 4|5 elements
+            // Select the 4|5 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx45_2nd, matrixArray_2);
+            // Calculate the result of the 4|5 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_45));
+
+            STORE8_COMPLETE_RESULT(result256_0, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*6]); // Load 1 row with n=6
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=7 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x7_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x7_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x7_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 1);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|0|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512 result_0, result_1, result_2, result_3;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_2 =
_mm512_set1_epi32(2); + __m512i load_idx_stage1_0 = _mm512_set_epi16(31, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14, + 31, 13, 12, 11, 10, 9, 8, 7, 31, 6, 5, 4, 3, 2, 1, 0); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+4)*7]); // Load 4 rows with n=7 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+8)*7]); // Load 4 rows with n=7 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+12)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_2); // |i0|i1|i2|i3|...|j6|j7|k0|k1|k2|k3|...|l6|l7| + matrixArray_3 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_3); // |m0|m1|m2|m3|...|n6|n7|o0|o1|o2|o3|...|p6|p7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 3: interleave per 256 bits + result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = 
_mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7
+            unsigned int tail_load_mask_value = (((unsigned int)0xffffffff) >> (4+(8-tail_num)*7));
+            __mmask32 tail_load_mask = *((__mmask32*) &tail_load_mask_value);
+            matrixArray_1 = _mm512_maskz_loadu_epi16(tail_load_mask, &a[(tag_m_16x+4)*7]); // Load the remaining tail_num-4 rows with n=7
+
+            // Stage 1: padding
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));
+            __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*7]); // Load 1 row with n=7
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=8 && lda ineffective scenario
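+// NOTE: as with the other kernels in this file, the #ifdef ladder below compiles
+// this routine in four flavors (alpha_beta, alpha_one, alpha and the plain one);
+// ALPHAVECTOR/BETAVECTOR are declared under the same macros and are presumably
+// consumed by the STORE*_COMPLETE_RESULT macros defined earlier in the file.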
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x8_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x8_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x8_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512 result_0, result_1, result_2, result_3;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_2 = _mm512_set1_epi32(2);
+        __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
+        __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2);
+
+        unsigned short x_blend_mask_value = ((unsigned short)0xff00);
+        __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value);
+        xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)));
+        xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3)));
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*8]); // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+4)*8]); // Load 4 rows with n=8
+            matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+8)*8]); // Load 4 rows with n=8
+            matrixArray_3 = _mm512_loadu_si512(&a[(idx_m+12)*8]); // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567);
+
+            // Stage 2: interleave per 256 bits
+            result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44);
+            result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee);
+
+            result_2 = _mm512_add_ps(result_2, result_3);
+
+            STORE16_COMPLETE_RESULT(result_2, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+4)*8]); // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8
+            unsigned short tail_load_mask_value = (((unsigned int)0xffff) >> ((8-tail_num)*4));
+            __mmask16 tail_load_mask = *((__mmask16*) &tail_load_mask_value);
+            matrixArray_1 = _mm512_maskz_loadu_epi32(tail_load_mask, &a[(tag_m_16x+4)*8]); // Load the remaining tail_num-4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));
+            __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 row with n=8
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 14 rows parallel processing BF16 GEMV kernel for n=9 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_14x9_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_14x9_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_14x9_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_14x = m - (m%14);
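+    // NOTE: with n=9, two 512-bit loads cover exactly 7 rows (63 bf16 elements =
+    // 32 loaded unmasked + 31 under load_mask), so one iteration of the main loop
+    // below consumes 14 rows.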
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
+    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0|
+
+    if (tag_m_14x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89;
+        __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m256i M256_EPI16_2 = _mm256_set1_epi16(2);
+        __m256i idx_base_0 = _mm256_set_epi16( 0, 0, 55, 54, 46, 45, 37, 36, 28, 27, 19, 18, 10, 9, 1, 0);
+        __m256i idx_base_1 = _mm256_add_epi16(idx_base_0, M256_EPI16_2);
+        __m256i idx_base_2 = _mm256_add_epi16(idx_base_1, M256_EPI16_2);
+        __m256i idx_base_3 = _mm256_add_epi16(idx_base_2, M256_EPI16_2);
+        __m256i idx_base_4 = _mm256_add_epi16(idx_base_3, M256_EPI16_2);
+        __m512i idx_idx = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
+
+        __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1));
+        __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3));
+        __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0));
+        __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2));
+        __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4));
+        __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 7);
+
+        xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1|
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3|
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5|
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7|
+        xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|0 |x8| 0| ...
|x8| 0| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 1); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short blend_mask_value = ((unsigned short)0x3f80); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_14x; idx_m+=14) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+3)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+7)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|g0|g1|a2|a3|...|g2|g3|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|g4|g5|a6|a7|...|g6|g7|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |h2|h3|...|n2|n3|h0|h1|...|n0|n1|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |h6|h7|...|n6|n7|h4|h5|...|n4|n5|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8| x|...|g8| x| x| x|...| x| x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|h8| x|...|n8| x|x|x|x|x| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|b0|b1|...|h0|h1|i0|i1|j0|j1|...|n0|n1|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|b2|b3|...|h2|h3|i2|i3|j2|j3|...|n2|n3|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|b4|b5|...|h4|h5|i4|i5|j4|j5|...|n4|n5|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|b6|b7|...|h6|h7|i6|i7|j6|j7|...|n6|n7|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_4, matrixArray_5); // |a8| x|b8| x|...|h8| x|i8| x|j8| x|...|n8| x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_14x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 7); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG 
i = tag_m_14x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*9]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 12 rows parallel processing BF16 GEMV kernel for n=10 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_12x10_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_12x10_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_12x10_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_12x = m - (m%12); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| + + if (tag_m_12x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_base_0 = _mm256_set_epi32( 0, 0, 26, 21, 16, 10, 5, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_1); + __m256i idx_base_2 = _mm256_add_epi32(idx_base_1, M256_EPI32_1); + __m256i idx_base_3 = _mm256_add_epi32(idx_base_2, M256_EPI32_1); + __m256i idx_base_4 = _mm256_add_epi32(idx_base_3, M256_EPI32_1); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 11, 10, 9, 8, 7, 6); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... 
|x0|x1|
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3|
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5|
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7|
+        xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|x9|x8|x9| ... |x8|x9|
+
+        unsigned short blend_mask_value = ((unsigned short)0x0fc0);
+        __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+        unsigned short load_mask_value = (((unsigned short)0xffff) >> 1);
+        __mmask16 load_mask = *((__mmask16*) &load_mask_value);
+        unsigned short store_mask_value = (((unsigned short)0xffff) >> 4);
+        __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_12x; idx_m+=12) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m)*10]); // Load 3 rows with n=10
+            matrixArray_1 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+3)*10]); // Load 3 rows with n=10
+            matrixArray_2 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+6)*10]); // Load 3 rows with n=10
+            matrixArray_3 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+9)*10]); // Load 3 rows with n=10
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|f0|f1|a2|a3|...|f2|f3|x|x|x|x|x|x|x|x|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|f4|f5|a6|a7|...|f6|f7|x|x|x|x|x|x|x|x|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |g2|g3|...|l2|l3|g0|g1|...|l0|l1|x|x|x|x|x|x|x|x|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |g6|g7|...|l6|l7|g4|g5|...|l4|l5|x|x|x|x|x|x|x|x|
+            matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8|a9|...|f8|f9| x| x|...| x| x|x|x|x|x|x|x|x|x|
+            matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|g8|g9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+            // Stage 2: interleave per 256 bits
+            matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|l0|l1|x|x|x|x|x|x|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|...|l2|l3|x|x|x|x|x|x|x|x|
+            matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|...|l4|l5|x|x|x|x|x|x|x|x|
+            matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|...|l6|l7|x|x|x|x|x|x|x|x|
+            matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_4, matrixArray_stage_5); // |a8|a9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+            result_0 = _mm512_add_ps(result_0, result_1);
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask)
+        }
+    }
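+    // NOTE: the per-row fallback below packs one n=10 row into a ymm with a
+    // 5-element epi32 masked load, multiplies against x held in a ymm, and reduces
+    // the eight partial sums with a 128-bit fold plus two shuffle/add steps.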
+
+    if (tag_m_12x != m) {
+        __m256i matrixArray256;
+        __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1);
+        __m256 result256;
+        __m128 result128, tmp128;
+        unsigned char load256_mask_value = (((unsigned char)0xff) >> 3);
+        __mmask8 load256_mask = *((__mmask8*) &load256_mask_value);
+        for (BLASLONG i = tag_m_12x; i < m; i++) {
+            result256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi32(load256_mask, &a[(i)*10]);
+            result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256);
+            result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1));
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 15 rows parallel processing BF16 GEMV kernel for n=11 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_15x11_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_15x11_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_15x11_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_15x = m - (m%15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7|
+    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0|
+
+    if (tag_m_15x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10;
+        __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
+        __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3;
+
+        __m512i M512_EPI16_2, M512_EPI16_4, M512_EPI16_6, M512_EPI32_5;
+        M512_EPI16_2 = _mm512_set1_epi16(2);
+        M512_EPI16_4 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2);
+        M512_EPI16_6 = _mm512_add_epi16(M512_EPI16_4, M512_EPI16_2);
+        M512_EPI32_5 = _mm512_set1_epi32(5);
+
+        unsigned int BASE_MASK_10_value = ((unsigned int)0x000003ff);
+        __mmask32 BASE_MASK_10 = *((__mmask32*) &BASE_MASK_10_value);
+        unsigned int BASE_MASK_20_value = ((unsigned int)0x000ffc00);
+        __mmask32 BASE_MASK_20 = *((__mmask32*) &BASE_MASK_20_value);
+        unsigned int BASE_MASK_30_value = ((unsigned int)0x3ff00000);
+        __mmask32 BASE_MASK_30 = *((__mmask32*) &BASE_MASK_30_value);
+
+        idx_stage1_base_0 = _mm512_set_epi16( 0, 0, 49, 48, 38, 37, 27, 26, 16, 15, 5, 4, 47, 46, 36, 35,
+                                             25, 24, 14, 13, 3, 2, 45, 44, 34, 33,
23, 22, 12, 11, 1, 0); + idx_stage1_base_1 = _mm512_add_epi16(idx_stage1_base_0, M512_EPI16_6); + + idx_stage1_base_2 = _mm512_mask_add_epi16(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_2 = _mm512_mask_sub_epi16(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_3 = _mm512_add_epi16(idx_stage1_base_2, M512_EPI16_6); + + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_sub_epi16(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI16_4); + idx_stage1_base_5 = _mm512_add_epi16(idx_stage1_base_4, M512_EPI16_6); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|0 |x10|0 | ... 
|x10|0 | + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 9); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*11 + 32]); // Load 2 rows with n=11 plus 1 element + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0|a1|...|e0|e1|a2|a3|...|e2|e3|a4 |a5|...|e4 |e5| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6|a7|...|e6|e7|a8|a9|...|e8|e9|a10|x |...|e10|x | + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2|f3|...|j2|j3|f0|f1|...|j0|j1|f4 |f5|...|j4 |j5| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8|f9|...|j8|j9|f6|f7|...|j6|j7|f10|x |...|j10|x | + matrixArray_stage_4 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4|k5|...|o4|o5|k2|k3|...|o2|o3|k0 |k1|...|o0 |o1| + matrixArray_stage_5 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|x|...|o10|x|k8|k9|...|o8|o9|k6 |k7|...|o6 |o7| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|j0|j1|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6|a7|...|j6|j7|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2|a3|...|j2|j3|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4|a5|...|j4|j5|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8|a9|...|j8|j9|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|x|...|j10|x|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = 
_mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 5); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*11]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=12 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x12_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x12_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_15x12_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i idx_stage1_base_0, idx_stage1_base_1, 
idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI32_1, M512_EPI32_2, M512_EPI32_3, M512_EPI32_5; + M512_EPI32_1 = _mm512_set1_epi32(1); + M512_EPI32_2 = _mm512_add_epi32(M512_EPI32_1, M512_EPI32_1); + M512_EPI32_3 = _mm512_add_epi32(M512_EPI32_2, M512_EPI32_1); + M512_EPI32_5 = _mm512_add_epi32(M512_EPI32_3, M512_EPI32_2); + + unsigned short BASE_MASK_10_value = ((unsigned short)0x001f); + __mmask16 BASE_MASK_10 = *((__mmask16*) &BASE_MASK_10_value); + unsigned short BASE_MASK_20_value = ((unsigned short)0x03e0); + __mmask16 BASE_MASK_20 = *((__mmask16*) &BASE_MASK_20_value); + unsigned short BASE_MASK_30_value = ((unsigned short)0xfc00); + __mmask16 BASE_MASK_30 = *((__mmask16*) &BASE_MASK_30_value); + + idx_stage1_base_0 = _mm512_set_epi32( 0, 26, 20, 14, 8, 2, 25, 19, 13, 7, 1, 24, 18, 12, 6, 0); + idx_stage1_base_1 = _mm512_add_epi32(idx_stage1_base_0, M512_EPI32_3); + + idx_stage1_base_2 = _mm512_mask_add_epi32(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_2 = _mm512_mask_sub_epi32(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_3 = _mm512_add_epi32(idx_stage1_base_2, M512_EPI32_3); + + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_sub_epi32(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI32_2); + idx_stage1_base_5 = _mm512_add_epi32(idx_stage1_base_4, M512_EPI32_3); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|x11|x10|x11| ... 
|x10|x11| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*12 + 32]); // Load 2 rows with n=12 plus 4 elements + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*12 + 32]); // Load 2 rows with n=12 plus 4 elements + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*12 + 32]); // Load 2 rows with n=12 plus 4 elements + + // Stage 1: interleave per 32 bits (one bf16 pair per lane) + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0 |a1 |...|e0 |e1 |a2|a3|...|e2|e3|a4 |a5 |...|e4 |e5 | + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6 |a7 |...|e6 |e7 |a8|a9|...|e8|e9|a10|a11|...|e10|e11| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2 |f3 |...|j2 |j3 |f0|f1|...|j0|j1|f4 |f5 |...|j4 |j5 | + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8 |f9 |...|j8 |j9 |f6|f7|...|j6|j7|f10|f11|...|j10|j11| + matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4 |k5 |...|o4 |o5 |k2|k3|...|o2|o3|k0 |k1 |...|o0 |o1 | + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|k11|...|o10|o11|k8|k9|...|o8|o9|k6 |k7 |...|o6 |o7 | + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0 |a1 |...|j0 |j1 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6 |a7 |...|j6 |j7 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2 |a3 |...|j2 |j3 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4 |a5 |...|j4 |j5 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8 |a9 |...|j8 |j9 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|a11|...|j10|j11|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // 
|a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*12]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + + +// 16 rows parallel processing BF16 GEMV kernel for n=13 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x13_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x13_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x13_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 3); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|0|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, 
matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + 
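+ // (the macro expands to _mm512_dpbf16_ps steps: each instruction multiplies 32 bf16 pairs and adds the pairwise products into 16 fp32 lanes, so lane k accumulates a[2k]*x[2k] + a[2k+1]*x[2k+1] for its row slice)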
BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*13]); // Load 1 rows with n=13 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=14 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x14_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x14_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x14_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif 
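+ // (ZERO_BETA/ONE_BETA/ONE_ALPHA compile this one template body into the _alpha_beta, _alpha_one, _alpha and plain variants; ALPHAVECTOR and BETAVECTOR are consumed by the STORE*_COMPLETE_RESULT macros when writing back to y)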
+#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + __m512i shift_idx = _mm512_set_epi32(0, 13, 12, 11, 10, 9, 8, 7, 0, 6, 5, 4, 3, 2, 1, 0); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x32_2(matrixArray, a, 14, idx_m, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_8x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_4x32_2(matrixArray, a, 14, tag_m_16x, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_4x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 14, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, 
y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*14]); // Load 1 rows with n=14 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=15 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x15_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x15_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x15_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), 
matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + 
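+ // result128 now holds the four fp32 dot products for rows a..d; STORE4_COMPLETE_RESULT scales by alpha (and adds beta*y in the beta variants) before the 4-element store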
STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*15]); // Load 1 rows with n=15 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=16 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x16_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x16_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x16_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_8 = _mm512_loadu_si512(&a[(idx_m )*16]); // Load 2 rows with n=16 + matrixArray_9 = _mm512_loadu_si512(&a[(idx_m+2 )*16]); // Load 2 rows with n=16 + matrixArray_10 = _mm512_loadu_si512(&a[(idx_m+4 )*16]); // Load 2 rows with n=16 + matrixArray_11 = _mm512_loadu_si512(&a[(idx_m+6 )*16]); // Load 2 rows with n=16 + matrixArray_12 = _mm512_loadu_si512(&a[(idx_m+8 )*16]); // Load 2 rows with n=16 + matrixArray_13 = _mm512_loadu_si512(&a[(idx_m+10)*16]); // Load 2 rows with n=16 + matrixArray_14 = _mm512_loadu_si512(&a[(idx_m+12)*16]); // Load 2 rows with n=16 + matrixArray_15 = _mm512_loadu_si512(&a[(idx_m+14)*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + 
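+ // (the 2-step interleave reorders the 16 rows so that each 32-bit lane carries one row's consecutive bf16 pair, lining up with the pair-interleaved copies of x prepared by BF16_INTERLEAVE_1x32 above)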
BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_4 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_5 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + matrixArray_6 = _mm512_loadu_si512(&a[(tag_m_16x+4 )*16]); // Load 2 rows with n=16 + matrixArray_7 = _mm512_loadu_si512(&a[(tag_m_16x+6 )*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, \ + matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + + matrixArray256_0 = _mm512_castsi512_si256(matrixArray_0); + matrixArray256_1 = _mm512_extracti32x8_epi32(matrixArray_0, 0x1); + matrixArray256_2 = _mm512_castsi512_si256(matrixArray_1); + matrixArray256_3 = _mm512_extracti32x8_epi32(matrixArray_1, 0x1); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = 
_mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n>16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16p_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16p_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16p_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> (32-n)); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + __m512i x512 = _mm512_maskz_loadu_epi16(load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|... + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512 accum512_0, accum512_1, accum512_2, accum512_3; + __m256 accum256; + __m128 accum128; + + if (tag_m_8x > 0) { + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = x512; + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..h[0:31] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_3 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_2 = _mm512_add_ps(accum512_2, accum512_3); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_2), _mm512_extractf32x8_ps(accum512_2, 1)); + STORE8_COMPLETE_RESULT(accum256, y+idx_m) + } + + if (m - tag_m_8x > 3) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 4 rows from matrix + BF16_MATRIX_MASKZ_LOAD_4x32(matrixArray, a, lda, tag_m_8x, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..d[0:31] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 
1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + STORE4_COMPLETE_RESULT(accum128, y+tag_m_8x) + tag_m_8x += 4; + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 rows with n=16 + accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) matrixArray_0, (__m512bh) x512); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_1x128_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_1x128_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_1x128_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + BLASLONG tag_n_128x = n & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m512 accum512_bridge[8]; + __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + if (tag_m_8x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + for (int j = idx_m; j < idx_m + 8; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, 
j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + // Temporarily save the result into a ZMM + accum512_bridge[j-idx_m] = accum512_t_0; + } + + FP32_INTERLEAVE_8x16_ARRAY(accum512_bridge) + FP32_ACCUM2_8x16_ARRAY(accum512_bridge) + accum512_bridge[1] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_0, accum512_bridge[4]); + accum512_bridge[2] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_1, accum512_bridge[4]); + accum512_bridge[1] = _mm512_add_ps(accum512_bridge[1], accum512_bridge[2]); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_bridge[1]), _mm512_extractf32x8_ps(accum512_bridge[1], 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG j = tag_m_8x; j < m; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + 
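+ // the masked loads zero-fill the lanes at and beyond column n, so the tail dot product below adds exact zeros for the out-of-range columns (accum512_t_2 simply reuses one of the four accumulators)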
BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_t_0), _mm512_extractf32x8_ps(accum512_t_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[j] = alpha * accum128[0] + beta * y[j]; +#else + y[j] = alpha * accum128[0] + y[j]; +#endif +#else +#ifndef ONE_ALPHA + y[j] = accum128[0] * alpha; +#else + y[j] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n=32 && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x32_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x32_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + if (tag_m_8x > 0) { + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 8 rows from matrix + BF16_MATRIX_LOAD_8x32(matrixArray, a, lda, idx_m, idx_n) + + // Load x + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + if (tag_n_32x != n) { // Go with masked 512 + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, tag_n_32x, tail_mask) + + // Load x 
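+ // (masked down to the n&31 remaining columns; zeroed lanes contribute nothing to the dot product)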
+ BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + // 2-step interleave for FP32 register array + FP32_INTERLEAVE_8x16(accum512) + + // Accumulate the 2 batches of registers into 2 registers (0 and 4) + FP32_ACCUM2_8x16(accum512) + + accum512_1 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_4); + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_4); + accum512_1 = _mm512_add_ps(accum512_1, accum512_2); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_1), _mm512_extractf32x8_ps(accum512_1, 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 32 elements from matrix + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, i, idx_n) + + // Load 32 elements from x + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + if (tag_n_32x != n) { + // Load tail elements from matrix + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, i, tag_n_32x, tail_mask) + + // Load tail elements from x + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n<=16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16m_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16m_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16m_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + __m256i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m256i xArray256; + + // Keep aligned with the other kernels and macro definitions; the high 256 bits are never used +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif + + __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ + accum256_8, accum256_9, accum256_10, accum256_11, accum256_12, accum256_13, 
accum256_14, accum256_15; + + __m256i M256_EPI32_4 = _mm256_set1_epi32(4); + __m256i idx_base_0 = _mm256_set_epi32(11, 10, 9, 8, 3, 2, 1, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_4); + + unsigned short load_mask_value = (((unsigned short)0xffff) >> (16-n)); + __mmask16 load_mask = *((__mmask16*) &load_mask_value); + + if (n == 16) { + BF16_VECTOR_LOAD_1x16(xArray256, x, 0) + } else { + BF16_VECTOR_MASKZ_LOAD_1x16(xArray256, x, 0, load_mask) + } + + if (n == 16) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_LOAD_8x16(matrixArray, a, lda, idx_m, 0) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 register array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batches of registers into 2 registers (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = _mm256_setzero_ps(); + matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 row with n=16 + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); + y[i] += accum128[0] * alpha; + } + } + } else { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray, a, lda, idx_m, 0, load_mask) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 register array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batches of registers into 2 registers (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = _mm256_setzero_ps(); + matrixArray_0 = _mm256_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 row with n<16 (masked) + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef 
+#ifndef ONE_BETA
+                y[i] = alpha * accum128[0] + beta * y[i];
+#else
+                y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+                y[i] = accum128[0] * alpha;
+#else
+                y[i] = accum128[0];
+#endif
+#endif
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c
new file mode 100644
index 000000000..3de586cb8
--- /dev/null
+++ b/kernel/x86_64/srot.c
@@ -0,0 +1,207 @@
+#include "common.h"
+
+#if defined(SKYLAKEX)
+#include "srot_microk_skylakex-2.c"
+#elif defined(HASWELL)
+#include "srot_microk_haswell-2.c"
+#endif
+
+#ifndef HAVE_SROT_KERNEL
+#include "../simd/intrin.h"
+
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128)
+    const int vstep = v_nlanes_f32;
+    const int unrollx4 = n & (-vstep * 4);
+    const int unrollx = n & -vstep;
+
+    v_f32 __c = v_setall_f32(c);
+    v_f32 __s = v_setall_f32(s);
+    v_f32 vx0, vx1, vx2, vx3;
+    v_f32 vy0, vy1, vy2, vy3;
+    v_f32 vt0, vt1, vt2, vt3;
+
+    for (; i < unrollx4; i += vstep * 4) {
+        vx0 = v_loadu_f32(x + i);
+        vx1 = v_loadu_f32(x + i + vstep);
+        vx2 = v_loadu_f32(x + i + vstep * 2);
+        vx3 = v_loadu_f32(x + i + vstep * 3);
+        vy0 = v_loadu_f32(y + i);
+        vy1 = v_loadu_f32(y + i + vstep);
+        vy2 = v_loadu_f32(y + i + vstep * 2);
+        vy3 = v_loadu_f32(y + i + vstep * 3);
+
+        vt0 = v_mul_f32(__s, vy0);
+        vt1 = v_mul_f32(__s, vy1);
+        vt2 = v_mul_f32(__s, vy2);
+        vt3 = v_mul_f32(__s, vy3);
+
+        vt0 = v_muladd_f32(__c, vx0, vt0);
+        vt1 = v_muladd_f32(__c, vx1, vt1);
+        vt2 = v_muladd_f32(__c, vx2, vt2);
+        vt3 = v_muladd_f32(__c, vx3, vt3);
+
+        v_storeu_f32(x + i, vt0);
+        v_storeu_f32(x + i + vstep, vt1);
+        v_storeu_f32(x + i + vstep * 2, vt2);
+        v_storeu_f32(x + i + vstep * 3, vt3);
+
+        vt0 = v_mul_f32(__s, vx0);
+        vt1 = v_mul_f32(__s, vx1);
+        vt2 = v_mul_f32(__s, vx2);
+        vt3 = v_mul_f32(__s, vx3);
+
+        vt0 = v_mulsub_f32(__c, vy0, vt0);
+        vt1 = v_mulsub_f32(__c, vy1, vt1);
+        vt2 = v_mulsub_f32(__c, vy2, vt2);
+        vt3 = v_mulsub_f32(__c, vy3, vt3);
+
+        v_storeu_f32(y + i, vt0);
+        v_storeu_f32(y + i + vstep, vt1);
+        v_storeu_f32(y + i + vstep * 2, vt2);
+        v_storeu_f32(y + i + vstep * 3, vt3);
+    }
+
+    for (; i < unrollx; i += vstep) {
+        vx0 = v_loadu_f32(x + i);
+        vy0 = v_loadu_f32(y + i);
+
+        vt0 = v_mul_f32(__s, vy0);
+        vt0 = v_muladd_f32(__c, vx0, vt0);
+        v_storeu_f32(x + i, vt0);
+
+        vt0 = v_mul_f32(__s, vx0);
+        vt0 = v_mulsub_f32(__c, vy0, vt0);
+        v_storeu_f32(y + i, vt0);
+    }
+#else
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+
+    BLASLONG n1 = n & (~7);
+    while (i < n1) {
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+#endif
+
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0, iy = 0;
+
+    FLOAT temp;
+
+    if (n <= 0)
+        return;
+    if ((inc_x == 1) && (inc_y == 1)) {
+        srot_kernel(n, x, y, c, s);
+    }
+    else {
+        while (i < n) {
+            temp = c * x[ix] + s * y[iy];
+            y[iy] = c * y[iy] - s * x[ix];
+            x[ix] = temp;
+
+            ix += inc_x;
+            iy += inc_y;
+            i++;
+        }
+    }
+    return;
+}
+
+
+#if defined(SMP)
+static int rot_thread_function(blas_arg_t *args)
+{
+
+    rot_compute(args->m,
+                args->a, args->lda,
+                args->b, args->ldb,
+                ((float *)args->alpha)[0],
+                ((float *)args->alpha)[1]);
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT alpha[2]={c, s};
+    FLOAT dummy_c;
+#endif
+
+#if defined(SMP)
+    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
+        nthreads = 1;
+    }
+    else {
+        nthreads = num_cpu_avail(1);
+    }
+
+    if (nthreads == 1) {
+        rot_compute(n, x, inc_x, y, inc_y, c, s);
+    }
+    else {
+#if defined(DOUBLE)
+        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+    }
+#else
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+#endif
+    return 0;
+}
diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c
new file mode 100644
index 000000000..8e245cc8f
--- /dev/null
+++ b/kernel/x86_64/srot_microk_haswell-2.c
@@ -0,0 +1,87 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+    BLASLONG tail_index_8 = n&(~7);
+    BLASLONG tail_index_32 = n&(~31);
+
+    __m256 c_256, s_256;
+    if (n >= 8) {
+        c_256 = _mm256_set1_ps(c);
+        s_256 = _mm256_set1_ps(s);
+    }
+
+    __m256 x0, x1, x2, x3;
+    __m256 y0, y1, y2, y3;
+    __m256 t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm256_loadu_ps(&x[i + 0]);
+        x1 = _mm256_loadu_ps(&x[i + 8]);
+        x2 = _mm256_loadu_ps(&x[i +16]);
+        x3 = _mm256_loadu_ps(&x[i +24]);
+        y0 = _mm256_loadu_ps(&y[i + 0]);
+        y1 = _mm256_loadu_ps(&y[i + 8]);
+        y2 = _mm256_loadu_ps(&y[i +16]);
+        y3 = _mm256_loadu_ps(&y[i +24]);
+
+        t0 = _mm256_mul_ps(s_256, y0);
+        t1 = _mm256_mul_ps(s_256, y1);
+        t2 = _mm256_mul_ps(s_256, y2);
+        t3 = _mm256_mul_ps(s_256, y3);
+
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
+        t1 = _mm256_fmadd_ps(c_256, x1, t1);
+        t2 = _mm256_fmadd_ps(c_256, x2, t2);
+        t3 = _mm256_fmadd_ps(c_256, x3, t3);
+
+        _mm256_storeu_ps(&x[i + 0], t0);
+        _mm256_storeu_ps(&x[i + 8], t1);
+        _mm256_storeu_ps(&x[i +16], t2);
+        _mm256_storeu_ps(&x[i +24], t3);
+
+        t0 = _mm256_mul_ps(s_256, x0);
+        t1 = _mm256_mul_ps(s_256, x1);
+        t2 = _mm256_mul_ps(s_256, x2);
+        t3 = _mm256_mul_ps(s_256, x3);
+
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        t1 = _mm256_fmsub_ps(c_256, y1, t1);
+        t2 = _mm256_fmsub_ps(c_256, y2, t2);
+        t3 = _mm256_fmsub_ps(c_256, y3, t3);
+
+        _mm256_storeu_ps(&y[i + 0], t0);
+        _mm256_storeu_ps(&y[i + 8], t1);
+        _mm256_storeu_ps(&y[i +16], t2);
+        _mm256_storeu_ps(&y[i +24], t3);
+
+    }
+
+    for (i = tail_index_32; i < tail_index_8; i += 8) {
+        x0 = _mm256_loadu_ps(&x[i]);
+        y0 = _mm256_loadu_ps(&y[i]);
+
+        t0 = _mm256_mul_ps(s_256, y0);
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
+        _mm256_storeu_ps(&x[i], t0);
+
+        t0 = _mm256_mul_ps(s_256, x0);
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        _mm256_storeu_ps(&y[i], t0);
+    }
+
+    for (i = tail_index_8; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c
new file mode 100644
index 000000000..a21d1cf64
--- /dev/null
+++ b/kernel/x86_64/srot_microk_skylakex-2.c
@@ -0,0 +1,91 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    __m512 c_512, s_512;
+    c_512 = _mm512_set1_ps(c);
+    s_512 = _mm512_set1_ps(s);
+
+    BLASLONG tail_index_16 = n&(~15);
+    BLASLONG tail_index_64 = n&(~63);
+
+
+    __m512 x0, x1, x2, x3;
+    __m512 y0, y1, y2, y3;
+    __m512 t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_64; i += 64) {
+        x0 = _mm512_loadu_ps(&x[i + 0]);
+        x1 = _mm512_loadu_ps(&x[i +16]);
+        x2 = _mm512_loadu_ps(&x[i +32]);
+        x3 = _mm512_loadu_ps(&x[i +48]);
+        y0 = _mm512_loadu_ps(&y[i + 0]);
+        y1 = _mm512_loadu_ps(&y[i +16]);
+        y2 = _mm512_loadu_ps(&y[i +32]);
+        y3 = _mm512_loadu_ps(&y[i +48]);
+
+        t0 = _mm512_mul_ps(s_512, y0);
+        t1 = _mm512_mul_ps(s_512, y1);
+        t2 = _mm512_mul_ps(s_512, y2);
+        t3 = _mm512_mul_ps(s_512, y3);
+
+        t0 = _mm512_fmadd_ps(c_512, x0, t0);
+        t1 = _mm512_fmadd_ps(c_512, x1, t1);
+        t2 = _mm512_fmadd_ps(c_512, x2, t2);
+        t3 = _mm512_fmadd_ps(c_512, x3, t3);
+
+        _mm512_storeu_ps(&x[i + 0], t0);
+        _mm512_storeu_ps(&x[i +16], t1);
+        _mm512_storeu_ps(&x[i +32], t2);
+        _mm512_storeu_ps(&x[i +48], t3);
+
+        t0 = _mm512_mul_ps(s_512, x0);
+        t1 = _mm512_mul_ps(s_512, x1);
+        t2 = _mm512_mul_ps(s_512, x2);
+        t3 = _mm512_mul_ps(s_512, x3);
+
+        t0 = _mm512_fmsub_ps(c_512, y0, t0);
+        t1 = _mm512_fmsub_ps(c_512, y1, t1);
+        t2 = _mm512_fmsub_ps(c_512, y2, t2);
+        t3 = _mm512_fmsub_ps(c_512, y3, t3);
+
+        _mm512_storeu_ps(&y[i + 0], t0);
+        _mm512_storeu_ps(&y[i +16], t1);
+        _mm512_storeu_ps(&y[i +32], t2);
+        _mm512_storeu_ps(&y[i +48], t3);
+    }
+
+    for (i = tail_index_64; i < tail_index_16; i += 16) {
+        x0 = _mm512_loadu_ps(&x[i]);
+        y0 = _mm512_loadu_ps(&y[i]);
+
+        t0 = _mm512_mul_ps(s_512, y0);
+        t0 = _mm512_fmadd_ps(c_512, x0, t0);
+        _mm512_storeu_ps(&x[i], t0);
+
+        t0 = _mm512_mul_ps(s_512, x0);
+        t0 = _mm512_fmsub_ps(c_512, y0, t0);
+        _mm512_storeu_ps(&y[i], t0);
+    }
+
+
+    if ((n & 15) > 0) {
+        uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15)));
+        __m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]);
+        __m512 tail_y = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]);
+        __m512 temp = _mm512_mul_ps(s_512, tail_y);
+        temp = _mm512_fmadd_ps(c_512, tail_x, temp);
+        _mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp);
+        temp = _mm512_mul_ps(s_512, tail_x);
+        temp = _mm512_fmsub_ps(c_512, tail_y, temp);
+        _mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp);
+    }
+}
+#endif
diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c
new file mode 100644
index 000000000..6e758e2e3
--- /dev/null
+++ b/kernel/x86_64/zasum.c
@@ -0,0 +1,144 @@
+#include "common.h"
+
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
+
+#if defined(SKYLAKEX)
+#include "zasum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_ZASUM_KERNEL
+static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
+{
+
+    BLASLONG i=0;
+    BLASLONG n_8 = n & -8;
+    FLOAT *x1 = x;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+
+    while (i < n_8) {    // i counts complex elements; each pass handles 4 of them (8 floats)
+        temp0 = ABS_K(x1[0]);
+        temp1 = ABS_K(x1[1]);
+        temp2 = ABS_K(x1[2]);
+        temp3 = ABS_K(x1[3]);
+        temp4 = ABS_K(x1[4]);
+        temp5 = ABS_K(x1[5]);
+        temp6 = ABS_K(x1[6]);
+        temp7 = ABS_K(x1[7]);
+
+        sum0 += temp0;
+        sum1 += temp1;
+        sum2 += temp2;
+        sum3 += temp3;
+
+        sum0 += temp4;
+        sum1 += temp5;
+        sum2 += temp6;
+        sum3 += temp7;
+
+        x1+=8;
+        i+=4;
+    }
+
+    while (i < n) {
+        sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
+        x1 += 2;
+        i++;
+    }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ip = 0;
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    if (inc_x == 1) {
+        sumf = zasum_kernel(n, x);
+    }
+    else {
+        inc_x2 = 2 * inc_x;
+
+        while (i < n) {
+            sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
+            ip += inc_x2;
+            i++;
+        }
+    }
+
+    return(sumf);
+}
+
+#if defined(SMP)
+static int asum_thread_function(BLASLONG n,
+                                BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+                                FLOAT *x, BLASLONG inc_x,
+                                FLOAT * dummy3, BLASLONG dummy4,
+                                FLOAT * result, BLASLONG dummy5)
+{
+    *(FLOAT *) result = asum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode,
+                                                BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+                                                void *a, BLASLONG lda,
+                                                void *b, BLASLONG ldb,
+                                                void *c, BLASLONG ldc,
+                                                int (*function)(),
+                                                int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+
+    if (nthreads == 1) {
+        sumf = asum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
+                                             NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
+        }
+    }
+#else
+    sumf = asum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
new file mode 100644
index 000000000..b44c53801
--- /dev/null
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -0,0 +1,340 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_ZASUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf=0.0;
+    BLASLONG n2 = n + n;
+
+
+    if (n2 < 32) {
+        __m128d accum_10, accum_11, accum_12, accum_13;
+        __m128d abs_mask1;
+
+        accum_10 = _mm_setzero_pd();
+        accum_11 = _mm_setzero_pd();
+        accum_12 = _mm_setzero_pd();
+        accum_13 = _mm_setzero_pd();
+
+        // build the |x| mask 0x7fffffffffffffff: cmpeq(x,x) gives all-ones for any contents, the shift clears the sign bit
+        abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
+        abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
+
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+        if (n2 >= 16){
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+            _mm_prefetch(&x1[8], _MM_HINT_T0);
+            __m128d x04 = _mm_loadu_pd(&x1[ 8]);
+            __m128d x05 = _mm_loadu_pd(&x1[10]);
+            __m128d x06 = _mm_loadu_pd(&x1[12]);
+            __m128d x07 = _mm_loadu_pd(&x1[14]);
+
+            x00 = _mm_and_pd(x00, abs_mask1);
+            x01 = _mm_and_pd(x01, abs_mask1);
+            x02 = _mm_and_pd(x02, abs_mask1);
+            x03 = _mm_and_pd(x03, abs_mask1);
+
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+
+            x04 = _mm_and_pd(x04, abs_mask1);
+            x05 = _mm_and_pd(x05, abs_mask1);
+            x06 = _mm_and_pd(x06, abs_mask1);
+            x07 = _mm_and_pd(x07, abs_mask1);
+
+            accum_10 = _mm_add_pd(accum_10, x04);
+            accum_11 = _mm_add_pd(accum_11, x05);
+            accum_12 = _mm_add_pd(accum_12, x06);
+            accum_13 = _mm_add_pd(accum_13, x07);
+
+            x1 += 16;
+            n2 -= 16;
+        }
+
+        if (n2 >= 8) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+            x00 = _mm_and_pd(x00, abs_mask1);
+            x01 = _mm_and_pd(x01, abs_mask1);
+            x02 = _mm_and_pd(x02, abs_mask1);
+            x03 = _mm_and_pd(x03, abs_mask1);
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+
+            n2 -= 8;
+            x1 += 8;
+        }
+
+        if (n2 >= 4) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            x00 = _mm_and_pd(x00, abs_mask1);
+            x01 = _mm_and_pd(x01, abs_mask1);
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+
+            n2 -= 4;
+            x1 += 4;
+        }
+
+        if (n2) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            x00 = _mm_and_pd(x00, abs_mask1);
+            accum_10 = _mm_add_pd(accum_10, x00);
+        }
+
+        accum_10 = _mm_add_pd(accum_10, accum_11);
+        accum_12 = _mm_add_pd(accum_12, accum_13);
+        accum_10 = _mm_add_pd(accum_10, accum_12);
+
+        accum_10 = _mm_hadd_pd(accum_10, accum_10);
+
+        sumf = accum_10[0];
+    }
+    else {
+        __m512d accum_0, accum_1, accum_2, accum_3;
+        __m512d x00, x01, x02, x03, x04, x05, x06, x07;
+        __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff);
+
+        accum_0 = _mm512_setzero_pd();
+        accum_1 = _mm512_setzero_pd();
+        accum_2 = _mm512_setzero_pd();
+        accum_3 = _mm512_setzero_pd();
+
+        // aligning the pointer has extra masked-load overhead that only pays off for large enough inputs
+        if (n2 < 128) {
+            if (n2 >= 64) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                x04 = _mm512_loadu_pd(&x1[32]);
+                x05 = _mm512_loadu_pd(&x1[40]);
+                x06 = _mm512_loadu_pd(&x1[48]);
+                x07 = _mm512_loadu_pd(&x1[56]);
+
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                x04 = _mm512_and_pd(x04, abs_mask);
+                x05 = _mm512_and_pd(x05, abs_mask);
+                x06 = _mm512_and_pd(x06, abs_mask);
+                x07 = _mm512_and_pd(x07, abs_mask);
+
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf = _mm512_reduce_add_pd(accum_0);
+        }
+        // n2 >= 128: align x1 to a 64-byte boundary first
+        else {
+
+            int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;
+
+            if (0 != align_header) {
+                unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header));
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= align_header;
+                x1 += align_header;
+            }
+
+            x00 = _mm512_load_pd(&x1[ 0]);
+            x01 = _mm512_load_pd(&x1[ 8]);
+            x02 = _mm512_load_pd(&x1[16]);
+            x03 = _mm512_load_pd(&x1[24]);
+            x04 = _mm512_load_pd(&x1[32]);
+            x05 = _mm512_load_pd(&x1[40]);
+            x06 = _mm512_load_pd(&x1[48]);
+            x07 = _mm512_load_pd(&x1[56]);
+
+            n2 -= 64;
+            x1 += 64;
+
+            while (n2 >= 64) {
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                x00 = _mm512_load_pd(&x1[ 0]);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                x02 = _mm512_load_pd(&x1[16]);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+                x03 = _mm512_load_pd(&x1[24]);
+
+                x04 = _mm512_and_pd(x04, abs_mask);
+                x05 = _mm512_and_pd(x05, abs_mask);
+                x06 = _mm512_and_pd(x06, abs_mask);
+                x07 = _mm512_and_pd(x07, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                x04 = _mm512_load_pd(&x1[32]);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                x05 = _mm512_load_pd(&x1[40]);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                x06 = _mm512_load_pd(&x1[48]);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+                x07 = _mm512_load_pd(&x1[56]);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+            x00 = _mm512_and_pd(x00, abs_mask);
+            x01 = _mm512_and_pd(x01, abs_mask);
+            x02 = _mm512_and_pd(x02, abs_mask);
+            x03 = _mm512_and_pd(x03, abs_mask);
+
+            accum_0 = _mm512_add_pd(accum_0, x00);
+            accum_1 = _mm512_add_pd(accum_1, x01);
+            accum_2 = _mm512_add_pd(accum_2, x02);
+            accum_3 = _mm512_add_pd(accum_3, x03);
+
+            x04 = _mm512_and_pd(x04, abs_mask);
+            x05 = _mm512_and_pd(x05, abs_mask);
+            x06 = _mm512_and_pd(x06, abs_mask);
+            x07 = _mm512_and_pd(x07, abs_mask);
+
+            accum_0 = _mm512_add_pd(accum_0, x04);
+            accum_1 = _mm512_add_pd(accum_1, x05);
+            accum_2 = _mm512_add_pd(accum_2, x06);
+            accum_3 = _mm512_add_pd(accum_3, x07);
+
+            if (n2 >= 32) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                x02 = _mm512_load_pd(&x1[16]);
+                x03 = _mm512_load_pd(&x1[24]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+                x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf = _mm512_reduce_add_pd(accum_0);
+        }
+    }
+
+    return sumf;
+}
+#endif
diff --git a/lapack/laswp/riscv64/Makefile b/lapack/laswp/riscv64/Makefile
new file mode 100644
index 000000000..75411deb5
--- /dev/null
+++ b/lapack/laswp/riscv64/Makefile
@@ -0,0 +1,14 @@
+TOPDIR = ../../..
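+# No RISC-V specific LASWP kernels yet: the ifndef defaults below fall back to the generic C implementations.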
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP = ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP = ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index f3ddde6a1..6c5e0f107 100644
--- a/param.h
+++ b/param.h
@@ -644,9 +644,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
-
+/*
 #define SGEMM_DEFAULT_UNROLL_MN 32
 #define DGEMM_DEFAULT_UNROLL_MN 32
+*/
 #endif
 
 #ifdef ARCH_X86
@@ -1454,22 +1455,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_P 768
 #define SGEMM_DEFAULT_R sgemm_r
-//#define SGEMM_DEFAULT_R 1024
+/*#define SGEMM_DEFAULT_R 1024*/
 
 #define DGEMM_DEFAULT_P 512
 #define DGEMM_DEFAULT_R dgemm_r
-//#define DGEMM_DEFAULT_R 1024
+/*#define DGEMM_DEFAULT_R 1024*/
 
 #define QGEMM_DEFAULT_P 504
 #define QGEMM_DEFAULT_R qgemm_r
 
 #define CGEMM_DEFAULT_P 768
 #define CGEMM_DEFAULT_R cgemm_r
-//#define CGEMM_DEFAULT_R 1024
+/*#define CGEMM_DEFAULT_R 1024*/
 
 #define ZGEMM_DEFAULT_P 512
 #define ZGEMM_DEFAULT_R zgemm_r
-//#define ZGEMM_DEFAULT_R 1024
+/*#define ZGEMM_DEFAULT_R 1024*/
 
 #define XGEMM_DEFAULT_P 252
 #define XGEMM_DEFAULT_R xgemm_r
@@ -1552,9 +1553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
-
+/*
 #define SGEMM_DEFAULT_UNROLL_MN 32
 #define DGEMM_DEFAULT_UNROLL_MN 32
+*/
 #endif
 
 #ifdef ARCH_X86
@@ -2388,7 +2390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(POWER9) || defined(POWER10)
+#if defined(POWER9)
 #define SNUMOPT 16
 #define DNUMOPT 8
 
@@ -2426,6 +2428,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #if defined(POWER10)
+#define SNUMOPT 16
+#define DNUMOPT 8
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 65536
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 832
+#define DGEMM_DEFAULT_P 320
+#define CGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_P 256
+
+#define SGEMM_DEFAULT_Q 1026
+#define DGEMM_DEFAULT_Q 960
+#define CGEMM_DEFAULT_Q 1026
+#define ZGEMM_DEFAULT_Q 1026
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 8
+
 #undef SBGEMM_DEFAULT_UNROLL_N
 #undef SBGEMM_DEFAULT_UNROLL_M
 #undef SBGEMM_DEFAULT_P
@@ -2537,7 +2572,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#ifdef LOONGSON3A
+#if defined(LOONGSON3R4)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#ifdef HAVE_MSA
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 8
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#else
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 4
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#endif
+
+#define SGEMM_DEFAULT_P 64
+#define DGEMM_DEFAULT_P 44
+#define CGEMM_DEFAULT_P 64
+#define ZGEMM_DEFAULT_P 32
+
+#define SGEMM_DEFAULT_Q 192
+#define DGEMM_DEFAULT_Q 92
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 80
+
+#define SGEMM_DEFAULT_R 640
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R 640
+#define ZGEMM_DEFAULT_R 640
+
+#define GEMM_OFFSET_A1 0x10000
+#define GEMM_OFFSET_B1 0x100000
+
+#define SYMV_P 16
+#endif
+
+#if defined(LOONGSON3R3)
 ////Copy from SICORTEX
 #define SNUMOPT 2
 #define DNUMOPT 2
@@ -2579,47 +2669,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#ifdef LOONGSON3B
-#define SNUMOPT 2
-#define DNUMOPT 2
-
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
-
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define SGEMM_DEFAULT_UNROLL_N 2
-
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_N 2
-
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
-
-#define ZGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
-
-#define SGEMM_DEFAULT_P 64
-#define DGEMM_DEFAULT_P 24
-#define CGEMM_DEFAULT_P 24
-#define ZGEMM_DEFAULT_P 20
-
-#define SGEMM_DEFAULT_Q 192
-#define DGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 128
-#define ZGEMM_DEFAULT_Q 64
-
-#define SGEMM_DEFAULT_R 512
-#define DGEMM_DEFAULT_R 512
-#define CGEMM_DEFAULT_R 512
-#define ZGEMM_DEFAULT_R 512
-
-#define GEMM_OFFSET_A1 0x10000
-#define GEMM_OFFSET_B1 0x100000
-
-#define SYMV_P 16
-#endif
-
 #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500)
 #define SNUMOPT 2
 #define DNUMOPT 2
@@ -2672,6 +2721,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
+#ifdef RISCV64_GENERIC
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
+#ifdef C910V
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 160
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
 #ifdef ARMV7
 #define SNUMOPT 2
 #define DNUMOPT 2
@@ -2752,7 +2879,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-// Common ARMv8 parameters
+/* Common ARMv8 parameters */
 #if defined(ARMV8)
 
 #define SNUMOPT 2
@@ -2955,7 +3082,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#else // Other/undetected ARMv8 cores
+#else /* Other/undetected ARMv8 cores */
 
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 4
@@ -2984,9 +3111,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#endif // Cores
+#endif /* Cores */
 
-#endif // ARMv8
+#endif /* ARMv8 */
 
 #if defined(ARMV5)
 #define SNUMOPT 2
diff --git a/test/Makefile b/test/Makefile
index eb3bc3447..5f653414a 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -23,7 +23,7 @@ endif
 
 level1: $(S1) $(D1) $(C1) $(Z1)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
 endif
@@ -83,7 +83,7 @@ endif
 
 level2: $(S2) $(D2) $(C2) $(Z2)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	rm -f ?BLAT2.SUMM
 ifeq ($(BUILD_SINGLE),1)
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
@@ -160,7 +160,7 @@ endif
 
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	rm -f ?BLAT3.SUMM
ifeq ($(BUILD_BFLOAT16),1)
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM
@@ -232,7 +232,7 @@ endif
 
 level3_3m : zblat3_3m cblat3_3m
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	rm -f ?BLAT3_3M.SUMM
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
 	@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@@ -258,6 +258,12 @@ endif
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 
+
+ifeq ($(CORE), C910V)
+EXTRALIB =
+CEXTRALIB =
+endif
+
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index dc5175fc5..0c99e0d12 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -27,13 +27,17 @@ endif ()
 
 # known to hang with the native Windows and Android threads
 # FIXME needs checking if this works on any of the other platforms
-if (NOT USE_OPENMP)
 if (OS_CYGWIN_NT OR OS_LINUX)
+if (NOT USE_OPENMP)
 set(OpenBLAS_utest_src
     ${OpenBLAS_utest_src}
     test_fork.c
     )
 endif()
+set(OpenBLAS_utest_src
+    ${OpenBLAS_utest_src}
+    test_post_fork.c
+    )
 endif()
 
 if (NOT NO_LAPACK)
@@ -54,7 +58,7 @@ add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
 
 target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME})
 
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
 target_link_libraries(${OpenBLAS_utest_bin} m)
 endif()
 
diff --git a/utest/Makefile b/utest/Makefile
index 31d4ccf00..fad3607de 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -25,15 +25,19 @@ endif
 
 #this does not work with OpenMP nor with native Windows or Android threads
 # FIXME TBD if this works on OSX, SunOS, POWER and zarch
-ifndef USE_OPENMP
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
+ifneq ($(USE_OPENMP), 1)
 OBJS += test_fork.o
 endif
+OBJS += test_post_fork.o
 endif
 
 ifeq ($(C_COMPILER), PGI)
 OBJS = utest_main2.o
 endif
+ifeq ($(C_COMPILER), SUN)
+OBJS = utest_main2.o
+endif
 ifeq ($(OSNAME), AIX)
 OBJS = utest_main2.o
 endif
@@ -44,7 +48,7 @@ $(UTESTBIN): $(OBJS)
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
 
 run_test: $(UTESTBIN)
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	./$(UTESTBIN)
 endif
 
diff --git a/utest/test_fork.c b/utest/test_fork.c
index 5c976f920..bd531e7fb 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/wait.h>
 #include "openblas_utest.h"
 
-void* xmalloc(size_t n)
+static void* xmalloc(size_t n)
 {
     void* tmp;
     tmp = malloc(n);
@@ -49,7 +49,7 @@ void* xmalloc(size_t n)
 }
 
 #ifdef BUILD_DOUBLE
-void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
+static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
 {
     char trans1 = 'T';
     char trans2 = 'N';
diff --git a/utest/test_post_fork.c b/utest/test_post_fork.c
new file mode 100644
index 000000000..9370a02ce
--- /dev/null
+++ b/utest/test_post_fork.c
@@ -0,0 +1,131 @@
+/*****************************************************************************
+Copyright (c) 2011-2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#ifdef USE_OPENMP
+#include <omp.h>
+#endif
+#include "openblas_utest.h"
+
+static void* xmalloc(size_t n)
+{
+    void* tmp;
+    tmp = malloc(n);
+    if (tmp == NULL) {
+        fprintf(stderr, "You are about to die\n");
+        exit(1);
+    } else {
+        return tmp;
+    }
+}
+
+#ifdef BUILD_DOUBLE
+static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
+{
+    char trans1 = 'T';
+    char trans2 = 'N';
+    double zerod = 0, oned = 1;
+    int i;
+    BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n);
+    for(i = 0; i < n * n; ++i) {
+        ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS);
+    }
+}
+#endif
+
+CTEST(fork, safety_after_fork_in_parent)
+{
+#ifndef BUILD_DOUBLE
+    exit(0);
+#else
+    blasint n = 100;
+    int i, nthreads_omp;
+
+    double *a, *b, *c, *d;
+    size_t n_bytes;
+
+    pid_t fork_pid;
+
+    n_bytes = sizeof(*a) * n * n;
+
+    a = xmalloc(n_bytes);
+    b = xmalloc(n_bytes);
+    c = xmalloc(n_bytes);
+    d = xmalloc(n_bytes);
+
+    // Put ones in a and b; put n in c (the expected result)
+    for(i = 0; i < n * n; ++i) {
+        a[i] = 1;
+        b[i] = 1;
+        c[i] = 1 * n;
+    }
+
+    // Test that OpenBLAS works after a fork.
+    // This situation routinely happens with Python's NumPy, where e.g.
+    // `sys.platform` calls `uname` in a forked process.
+    // So we simulate that situation here.
+
+    // There was an issue where differing numbers of OpenBLAS and OpenMP
+    // threads triggered a memory leak, so run this multiple times
+    // with different numbers of threads set.
+#ifdef USE_OPENMP
+    nthreads_omp = omp_get_max_threads();
+    // Run with half the max OMP threads, the max threads, and twice that
+    for(i = (nthreads_omp + 1) / 2; i <= nthreads_omp * 2; i *= 2) {
+        omp_set_num_threads(i);
+#endif
+
+        fork_pid = fork();
+        if (fork_pid == -1) {
+            CTEST_ERR("Failed to fork process.");
+        } else if (fork_pid == 0) {
+            // Just pretend to do something, e.g. call `uname`, then exit
+            exit(0);
+        } else {
+            // Wait for the child to finish and check the exit code.
+            int child_status = 0;
+            pid_t wait_pid = wait(&child_status);
+            ASSERT_EQUAL(wait_pid, fork_pid);
+            ASSERT_EQUAL(0, WEXITSTATUS(child_status));
+
+            // Now OpenBLAS has to work
+            check_dgemm(a, b, d, c, n);
+        }
+#ifdef USE_OPENMP
+    }
+#endif
+
+#endif
+}
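Reviewer note, not part of the patch: the srot kernels added above implement the BLAS plane rotation x[i] = c*x[i] + s*y[i], y[i] = c*y[i] - s*x[i]. A minimal standalone smoke test through the CBLAS interface, assuming a built libopenblas with cblas.h on the include path, could look like the sketch below; the file name srot_demo.c and the tolerance are illustrative only.

    /* Hypothetical usage sketch; build with: cc srot_demo.c -lopenblas -lm */
    #include <stdio.h>
    #include <math.h>
    #include <cblas.h>

    int main(void) {
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
        const float c = 0.6f, s = 0.8f;      /* c*c + s*s == 1, a valid Givens pair */
        float ex = c * x[0] + s * y[0];      /* expected x[0] from the scalar formula */
        float ey = c * y[0] - s * x[0];      /* expected y[0] from the scalar formula */

        cblas_srot(4, x, 1, y, 1, c, s);     /* unit strides take the vectorized srot_kernel path */

        printf("x[0]=%f (expect %f), y[0]=%f (expect %f)\n", x[0], ex, y[0], ey);
        return (fabsf(x[0] - ex) > 1e-6f) || (fabsf(y[0] - ey) > 1e-6f);
    }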