diff --git a/.travis.yml b/.travis.yml index 3f917ce72..bde0e202d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -211,44 +211,48 @@ matrix: - &test-macos os: osx - osx_image: xcode10.1 + osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - <<: *test-macos osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update - - brew install gcc@10 # for gfortran + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - - <<: *test-macos - osx_image: xcode10.0 - env: - - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" + # - <<: *test-macos + # osx_image: xcode10 + # env: + # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 53c1709a8..12730e0e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 
12)
+set(OpenBLAS_PATCH_VERSION 13)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 7b994885a..be9a32a7c 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -190,4 +190,7 @@ In chronological order:
   * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support
 
 * Danfeng Zhang
-  * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
\ No newline at end of file
+  * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
+
+* PingTouGe Semiconductor Co., Ltd.
+  * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
diff --git a/Changelog.txt b/Changelog.txt
index edd3563ec..cbc7007ac 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,54 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.13
+ 12-Dec-2020
+
+common:
+ * Added a generic bfloat16 SBGEMV kernel
+ * Fixed a potentially severe memory leak after fork in OpenMP builds
+   that was introduced in 0.3.12
+ * Added detection of the Fujitsu Fortran compiler
+ * Added detection of the (e)gfortran compiler on OpenBSD
+ * Added support for overriding the default name of the library independently
+   from symbol suffixing in the gmake builds (already supported in cmake)
+
+RISCV:
+ * Added a RISC-V port optimized for C910V
+
+POWER:
+ * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N
+ * Improved DGEMM performance on POWER10
+ * Improved STRSM and DTRSM performance on POWER9 and POWER10
+ * Fixed segmentation faults in DYNAMIC_ARCH builds
+ * Fixed compilation with the PGI compiler
+
+x86:
+ * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12
+
+x86_64:
+ * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake
+ * Improved the performance of SASUM and DASUM kernels through parallelization
+ * Improved the performance of SROT and DROT kernels
+ * Improved the performance of multithreaded xSYRK
+ * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran
+   (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or
+   wrong results)
+ * Fixed miscompilations by old gcc 4.6
+ * Fixed misdetection of AVX2 capability in some Sandybridge cpus
+ * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD
+
+ARM64:
+ * Fixed segmentation faults in DYNAMIC_ARCH builds
+
+MIPS:
+ * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA
+ * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV
+ * Added handling of zero increments in the MSA kernels for SSWAP and DSWAP
+ * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only)
+
+SPARC:
+ * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers
+
 ====================================================================
 Version 0.3.12
  24-Oct-2020
diff --git a/Makefile b/Makefile
index a9af62a22..54dd3be41 100644
--- a/Makefile
+++ b/Makefile
@@ -268,7 +268,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
+ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
+	-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
+else
 	-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
+endif
 	-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
diff --git a/Makefile.arm b/Makefile.arm
index fac6b56824..a27b58e84 100644
--- a/Makefile.arm
+++ b/Makefile.arm
@@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6)
 CCOMMON_OPT += -mfpu=vfp
 FCOMMON_OPT += -mfpu=vfp
 endif
+
+ifdef HAVE_NEON
+CCOMMON_OPT += -mfpu=neon
+FCOMMON_OPT += -mfpu=neon
+endif
diff --git a/Makefile.install b/Makefile.install
index 7c1a3ca43..e8b64465f 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
 OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
 OPENBLAS_BINARY_DIR := $(PREFIX)/bin
 OPENBLAS_BUILD_DIR := $(CURDIR)
-OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
+OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE)
 OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
 OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
 OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
@@ -150,13 +150,13 @@ endif
 endif
 
 #Generating openblas.pc
-	@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
-	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
+	@echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
+	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
 
 #Generating OpenBLASConfig.cmake
diff --git a/Makefile.prebuild b/Makefile.prebuild
index 48fb5e991..d6395da7b 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -41,6 +41,10 @@ ifeq ($(TARGET), I6500)
 TARGET_FLAGS = -mips64r6
 endif
 
+ifeq ($(TARGET), C910V)
+TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v
+endif
+
 all: getarch_2nd
 	./getarch_2nd 0 >> $(TARGET_MAKE)
 	./getarch_2nd 1 >> $(TARGET_CONF)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
new file mode 100644
index 000000000..15d7b059c
--- /dev/null
+++ b/Makefile.riscv64
@@ -0,0 +1,4 @@
+ifeq ($(CORE), C910V)
+CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v
+FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static
+endif
diff --git a/Makefile.rule b/Makefile.rule
index a4d11dc7c..e4b82104e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.12
+VERSION = 0.3.13
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.sparc b/Makefile.sparc
index 8895b96dd..61c7aa36d 100644
--- a/Makefile.sparc
+++ b/Makefile.sparc
@@ -3,21 +3,29 @@ RANLIB = ranlib
 
 ifdef BINARY64
+ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT += -mcpu=v9 -m64
+else
+CCOMMON_OPT += -m64
+endif
 
 ifeq ($(COMPILER_F77), g77)
 FCOMMON_OPT += -mcpu=v9 -m64
 endif
 
-ifeq ($(COMPILER_F77), f90)
-FCOMMON_OPT += -xarch=v9
+ifeq ($(COMPILER_F77), f95)
+FCOMMON_OPT += -m64
 endif
 
 else
+ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT += -mcpu=v9
+else
+CCOMMON_OPT += -xarch=v9
+endif
 
 ifeq ($(COMPILER_F77), g77)
 FCOMMON_OPT += -mcpu=v9
 endif
 
-ifeq ($(COMPILER_F77), f90)
+ifeq ($(COMPILER_F77), f95)
 FCOMMON_OPT += -xarch=v8plusb
 endif
 
@@ -37,4 +45,4 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \
 else
 LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \
 	-Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath
-endif
\ No newline at end of file
+endif
diff --git a/Makefile.system b/Makefile.system
index 30d8f4ccf..5adde36d8 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -6,7 +6,7 @@
 INCLUDED = 1
 
 ifndef TOPDIR
-TOPDIR = .
+TOPDIR  = .
 endif
 
 # If ARCH is not set, we use the host system's architecture for getarch compile options.
@@ -93,6 +93,12 @@ endif
 
 ifdef TARGET
 GETARCH_FLAGS := -DFORCE_$(TARGET)
 GETARCH_FLAGS += -DUSER_TARGET
+ifeq ($(TARGET), GENERIC)
+ifeq ($(DYNAMIC_ARCH), 1)
+override NO_EXPRECISION=1
+export NO_EXPRECISION
+endif
+endif
 endif
 
 # Force fallbacks for 32bit
@@ -246,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
 else
+HAVE_NEON=
+HAVE_VFP=
+HAVE_VFPV3=
+HAVE_VFPV4=
+HAVE_MMX=
+HAVE_SSE=
+HAVE_SSE2=
+HAVE_SSE3=
+HAVE_SSSE3=
+HAVE_SSE4_1=
+HAVE_SSE4_2=
+HAVE_SSE4A=
+HAVE_SSE5=
+HAVE_AVX=
+HAVE_AVX2=
+HAVE_FMA3=
 include $(TOPDIR)/Makefile_kernel.conf
 endif
@@ -319,6 +341,7 @@ ifeq ($(GCCVERSIONGTEQ7),1)
 else
 GCCDUMPVERSION_PARAM := -dumpversion
 endif
+GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
 GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
 endif
@@ -602,6 +625,10 @@ DYNAMIC_CORE += EMAG8180
 DYNAMIC_CORE += THUNDERX3T110
 endif
 
+ifeq ($(ARCH), mips64)
+DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
+endif
+
 ifeq ($(ARCH), zarch)
 DYNAMIC_CORE = ZARCH_GENERIC
 
@@ -649,7 +676,7 @@ DYNAMIC_CORE += POWER9
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
-LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35)
+LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
 ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
 CCOMMON_OPT += -DHAVE_P10_SUPPORT
@@ -728,7 +755,10 @@ endif
 endif
 endif
 
-
+ifeq ($(ARCH), riscv64)
+NO_BINARY_MODE = 1
+BINARY_DEFINED = 1
+endif
 
 #
@@ -761,14 +791,9 @@ CCOMMON_OPT += -mabi=32
 BINARY_DEFINED = 1
 endif
 
-ifeq ($(CORE), LOONGSON3A)
-CCOMMON_OPT += -march=mips64
-FCOMMON_OPT += -march=mips64
-endif
-
-ifeq ($(CORE), LOONGSON3B)
-CCOMMON_OPT += -march=mips64
-FCOMMON_OPT += -march=mips64
+ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
+CCOMMON_OPT += -march=loongson3a
+FCOMMON_OPT += -march=loongson3a
 endif
 
 ifeq ($(CORE), MIPS24K)
@@ -810,7 +835,9 @@ endif
 ifndef BINARY_DEFINED
 ifneq ($(OSNAME), AIX)
 ifdef BINARY64
+ifneq ($(ARCH), riscv64)
 CCOMMON_OPT += -m64
+endif
 else
 CCOMMON_OPT += -m32
 endif
@@ -855,7 +882,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
 FCOMMON_OPT += -Mrecursive -Kieee
 ifeq ($(OSNAME), Linux)
 ifeq ($(ARCH), x86_64)
-FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
+FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
 ifeq ($(FLANG_VENDOR),AOCC)
 FCOMMON_OPT += -fno-unroll-loops
 endif
@@ -931,8 +958,10 @@ endif
 else
 ifdef BINARY64
 ifneq ($(OSNAME), AIX)
+ifneq ($(ARCH), riscv64)
 FCOMMON_OPT += -m64
 endif
+endif
 ifdef INTERFACE64
 ifneq ($(INTERFACE64), 0)
 FCOMMON_OPT += -fdefault-integer-8
@@ -1048,11 +1077,11 @@ FCOMMON_OPT += -n32
 else
 FCOMMON_OPT += -n64
 endif
-ifeq ($(CORE), LOONGSON3A)
+ifeq ($(CORE), LOONGSON3R3)
 FCOMMON_OPT += -loongson3 -static
 endif
 
-ifeq ($(CORE), LOONGSON3B)
+ifeq ($(CORE), LOONGSON3R4)
 FCOMMON_OPT += -loongson3 -static
 endif
 
@@ -1078,11 +1107,11 @@ CCOMMON_OPT += -n32
 else
 CCOMMON_OPT += -n64
 endif
-ifeq ($(CORE), LOONGSON3A)
+ifeq ($(CORE), LOONGSON3R3)
 CCOMMON_OPT += -loongson3 -static
 endif
 
-ifeq ($(CORE), LOONGSON3B)
+ifeq ($(CORE), LOONGSON3R4)
 CCOMMON_OPT += -loongson3 -static
 endif
 
@@ -1101,16 +1130,25 @@ CCOMMON_OPT += -w
 ifeq ($(ARCH), x86)
 CCOMMON_OPT += -m32
 else
-FCOMMON_OPT += -m64
+ifdef BINARY64
+CCOMMON_OPT += -m64
+else
+CCOMMON_OPT += -m32
+endif
 endif
 endif
 
 ifeq ($(F_COMPILER), SUN)
 CCOMMON_OPT += -DF_INTERFACE_SUN
+FCOMMON_OPT += -ftrap=%none -xrecursive
 ifeq ($(ARCH), x86)
 FCOMMON_OPT += -m32
 else
+ifdef BINARY64
 FCOMMON_OPT += -m64
+else
+FCOMMON_OPT += -m32
+endif
 endif
 ifeq ($(USE_OPENMP), 1)
 FCOMMON_OPT += -xopenmp=parallel
@@ -1184,10 +1222,8 @@ ifdef SMP
 CCOMMON_OPT += -DSMP_SERVER
 
 ifeq ($(ARCH), mips64)
-ifneq ($(CORE), LOONGSON3B)
 USE_SIMPLE_THREADED_LEVEL3 = 1
 endif
-endif
 
 ifeq ($(USE_OPENMP), 1)
 # USE_SIMPLE_THREADED_LEVEL3 = 1
@@ -1262,10 +1298,14 @@ ifndef SYMBOLSUFFIX
 SYMBOLSUFFIX =
 endif
 
+ifndef LIBSONAMEBASE
+LIBSONAMEBASE = openblas
+endif
+
 ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)
+LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
 else
-LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
 endif
 
 ifeq ($(OSNAME), CYGWIN_NT)
@@ -1279,8 +1319,10 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
 
 include $(TOPDIR)/Makefile.$(ARCH)
 
 ifneq ($(C_COMPILER), PGI)
+ifneq ($(C_COMPILER), SUN)
 CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
 endif
+endif
 CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
 
 ifeq ($(CORE), PPC440)
@@ -1297,11 +1339,9 @@ endif
 
 ifneq ($(ARCH), x86_64)
 ifneq ($(ARCH), x86)
-ifneq ($(CORE), LOONGSON3B)
 NO_AFFINITY = 1
 endif
 endif
-endif
 
 ifdef NO_AFFINITY
 ifeq ($(NO_AFFINITY), 0)
@@ -1515,6 +1555,8 @@ export HAVE_SSE4_2
 export HAVE_SSE4A
 export HAVE_SSE5
 export HAVE_AVX
+export HAVE_AVX2
+export HAVE_FMA3
 export HAVE_VFP
 export HAVE_VFPV3
 export HAVE_VFPV4
@@ -1525,6 +1567,7 @@ export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE
 export NO_AVX512
+export NO_AVX2
 export BUILD_BFLOAT16
 
 export SBGEMM_UNROLL_M
diff --git a/Makefile.x86 b/Makefile.x86
index 330690935..0e27264d8 100644
--- a/Makefile.x86
+++ b/Makefile.x86
@@ -1,5 +1,10 @@
 # COMPILER_PREFIX = mingw32-
 
+ifdef HAVE_SSE
+CCOMMON_OPT += -msse
+FCOMMON_OPT += -msse
+endif
+
 ifeq ($(OSNAME), Interix)
 ARFLAGS = -m x86
 
@@ -54,9 +59,11 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
 else
 LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm
 endif
-
+ifdef HAVE_SSE2
+CCOMMON_OPT += -msse2
+FCOMMON_OPT += -msse2
+endif
 ifdef HAVE_SSE3
-ifndef DYNAMIC_ARCH
 CCOMMON_OPT += -msse3
 FCOMMON_OPT += -msse3
 ifdef HAVE_SSSE3
@@ -68,5 +75,4 @@ CCOMMON_OPT += -msse4.1
 FCOMMON_OPT += -msse4.1
 endif
 endif
-endif
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index a849f0b01..00967bcb6 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -9,9 +9,9 @@ endif
 endif
 
 ifdef HAVE_SSE3
-ifndef DYNAMIC_ARCH
 CCOMMON_OPT += -msse3
 FCOMMON_OPT += -msse3
+endif
 ifdef HAVE_SSSE3
 CCOMMON_OPT += -mssse3
 FCOMMON_OPT += -mssse3
@@ -20,6 +20,22 @@ ifdef HAVE_SSE4_1
 CCOMMON_OPT += -msse4.1
 FCOMMON_OPT += -msse4.1
 endif
+ifndef OLDGCC
+ifdef HAVE_AVX
+CCOMMON_OPT += -mavx
+FCOMMON_OPT += -mavx
+endif
+endif
+ifndef NO_AVX2
+ifdef HAVE_AVX2
+CCOMMON_OPT += -mavx2
+FCOMMON_OPT += -mavx2
+endif
+endif
+ifndef OLDGCC
+ifdef HAVE_FMA3
+CCOMMON_OPT += -mfma
+FCOMMON_OPT += -mfma
 endif
 endif
 
@@ -47,8 +63,6 @@ ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
 # cooperlake support was added in 10.1
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
 CCOMMON_OPT += -march=cooperlake
 FCOMMON_OPT += -march=cooperlake
@@ -68,15 +82,11 @@ endif
 endif
 endif
 
-ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
-ifndef DYNAMIC_ARCH
+ifdef HAVE_AVX2
 ifndef NO_AVX2
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
 ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 CCOMMON_OPT += -mavx2
 endif
@@ -101,7 +111,6 @@ endif
 endif
 endif
 endif
-endif
diff --git a/README.md b/README.md
index ca034e747..267df5358 100644
--- a/README.md
+++ b/README.md
@@ -172,6 +172,13 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **Z13**: Optimized Level-3 BLAS and Level-1,2
 - **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
 
+#### RISC-V
+
+- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
+  ```sh
+  make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
+  ```
+
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
diff --git a/TargetList.txt b/TargetList.txt
index 66eca4506..d19964916 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -104,3 +104,8 @@ VORTEX
 ZARCH_GENERIC
 Z13
 Z14
+
+10.RISC-V 64:
+RISCV64_GENERIC
+C910V
+
diff --git a/benchmark/amax.c b/benchmark/amax.c
index 19ae95c8b..29310dd71 100644
--- a/benchmark/amax.c
+++ b/benchmark/amax.c
@@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
 
 #undef AMAX
 
 #ifdef COMPLEX
 #ifdef DOUBLE
-#define AMAX   BLASFUNC(dzamax)
+#define AMAX BLASFUNC(dzamax)
 #else
-#define AMAX   BLASFUNC(scamax)
+#define AMAX BLASFUNC(scamax)
 #endif
 #else
 #ifdef DOUBLE
-#define AMAX   BLASFUNC(damax)
+#define AMAX BLASFUNC(damax)
 #else
-#define AMAX   BLASFUNC(samax)
+#define AMAX BLASFUNC(samax)
 #endif
 #endif
 
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[])
+{
 
   FLOAT *x;
   blasint m, i;
-  blasint inc_x=1;
+  blasint inc_x = 1;
   int loops = 1;
   int l;
   char *p;
 
+  int from = 1;
+  int to = 200;
+  int step = 1;
 
-  int from =   1;
-  int to   = 200;
-  int step =   1;
+  double time1, timeg;
 
-  struct timeval start, stop;
-  double time1,timeg;
+  argc--;
+  argv++;
 
-  argc--;argv++;
+  if (argc > 0)
+  {
+    from = atol(*argv);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    to = MAX(atol(*argv), from);
+    argc--;
+    argv++;
+  }
+  if (argc > 0)
+  {
+    step = atol(*argv);
+    argc--;
+    argv++;
+  }
 
-  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
-  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
-  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}
+  if ((p = getenv("OPENBLAS_LOOPS")))
+    loops = atoi(p);
+  if ((p = getenv("OPENBLAS_INCX")))
+    inc_x = atoi(p);
- if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -152,37 +100,31 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); - fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMIN #ifdef COMPLEX #ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) +#define AMIN BLASFUNC(dzamin) #else -#define AMIN BLASFUNC(scamin) +#define AMIN BLASFUNC(scamin) #endif #else #ifdef DOUBLE -#define AMIN BLASFUNC(damin) +#define AMIN BLASFUNC(damin) #else -#define AMIN BLASFUNC(samin) +#define AMIN BLASFUNC(samin) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; + int from = 1; + int to = 200; + int step = 1; - struct timeval start, stop; - double time1,timeg; + double time1, timeg; - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -151,39 +100,35 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef ASUM #ifdef COMPLEX #ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) +#define ASUM BLASFUNC(dzasum) #else -#define ASUM BLASFUNC(scasum) +#define ASUM BLASFUNC(scasum) #endif #else #ifdef DOUBLE -#define ASUM BLASFUNC(dasum) +#define ASUM BLASFUNC(dasum) #else -#define ASUM BLASFUNC(sasum) +#define ASUM BLASFUNC(sasum) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; FLOAT result; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; - -#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) - struct timeval start, stop; - double time1,timeg; -#else - struct timespec start = { 0, 0 }, stop = { 0, 0 }; + int from = 1; + int to = 200; + int step = 1; double time1, timeg; -#endif - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : 
%3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; } + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } #ifdef __linux srandom(getpid()); @@ -158,45 +100,33 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); - for (l=0; l1) - timeg /= loops; + if (loops > 1) + timeg /= loops; #ifdef COMPLEX fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif - } return 0; diff --git a/benchmark/axpby.c b/benchmark/axpby.c index 793ee7e40..d02d9a889 100644 --- a/benchmark/axpby.c +++ b/benchmark/axpby.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPBY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -176,16 +104,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -127,8 +56,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timespec start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - clock_gettime( CLOCK_REALTIME, &start); + begin(); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); - clock_gettime( 
CLOCK_REALTIME, &stop);
+	end();
 
-	time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+	time1 = getsec();
 
 	timeg += time1;
 
diff --git a/benchmark/bench.h b/benchmark/bench.h
new file mode 100644
index 000000000..1f9b8986c
--- /dev/null
+++ b/benchmark/bench.h
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+  FILETIME ft;
+  unsigned __int64 tmpres = 0;
+  static int tzflag;
+
+  if (NULL != tv)
+  {
+    GetSystemTimeAsFileTime(&ft);
+
+    tmpres |= ft.dwHighDateTime;
+    tmpres <<= 32;
+    tmpres |= ft.dwLowDateTime;
+
+    /*converting file time to unix epoch*/
+    tmpres /= 10;  /*convert into microseconds*/
+    tmpres -= DELTA_EPOCH_IN_MICROSECS;
+    tv->tv_sec = (long)(tmpres / 1000000UL);
+    tv->tv_usec = (long)(tmpres % 1000000UL);
+  }
+
+  return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+  int shmid;
+  void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+  if ((shmid =shmget(IPC_PRIVATE,
+                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+    printf( "Memory allocation failed(shmget).\n");
+    exit(1);
+  }
+
+  address = shmat(shmid, NULL, SHM_RND);
+
+  if ((BLASLONG)address == -1){
+    printf( "Memory allocation failed(shmat).\n");
+    exit(1);
+  }
+
+  shmctl(shmid, IPC_RMID, 0);
+
+  return address;
+}
+
+
+#define malloc huge_malloc
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  struct timeval start, stop;
+#else
+  struct timespec start = { 0, 0 }, stop = { 0, 0 };
+#endif
+
+double getsec()
+{
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+#else
+  return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+#endif
+}
+
+void begin() {
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  gettimeofday( &start, (struct timezone *)0);
+#else
+  clock_gettime(CLOCK_REALTIME, &start);
+#endif
+}
+
+void end() {
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+  gettimeofday( &stop, (struct timezone *)0);
+#else
+  clock_gettime(CLOCK_REALTIME, &stop);
+#endif
+}
\ No newline at end of file
diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c
index 5908b6085..65b20d039 100644
--- a/benchmark/cholesky.c
+++ b/benchmark/cholesky.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -71,41 +66,6 @@ double fabs(double); #endif #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - - static __inline double getmflops(int ratio, int m, double secs){ double mm = (double)m; @@ -145,7 +105,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1; argc--;argv++; @@ -220,20 +179,19 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - maxerr = 0.; if (!(uplos & 1)) { for (j = 0; j < m; j++) { diff --git a/benchmark/copy.c b/benchmark/copy.c index eb5148fff..c5e447521 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef COPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,11 +57,9 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1 = 0.0, timeg = 0.0; long nanos = 0; time_t seconds = 0; - struct timespec time_start = { 0, 0 }, time_end = { 0, 0 }; argc--;argv++; @@ -176,15 +103,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(ddot) #else #define DOT BLASFUNC(sdot) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -122,7 +49,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -169,15 +95,12 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + 
begin(); result = DOT (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + end(); + timeg += getsec(); } diff --git a/benchmark/geev.c b/benchmark/geev.c index 4fd2c8d6f..6e22cdfb6 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -36,13 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEEV @@ -74,71 +68,6 @@ extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; @@ -154,7 +83,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -223,7 +151,7 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; #ifndef COMPLEX @@ -239,14 +167,14 @@ int main(int argc, char *argv[]){ GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 8cd14bbed..35f5096f3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -55,71 +49,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ IFLOAT *a, *b; @@ -139,7 +68,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1, timeg; argc--;argv++; @@ -228,14 +156,14 @@ int main(int argc, char *argv[]){ ldc = m; fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k); - gettimeofday( &start, (struct timezone *)0); + begin(); for (j=0; j -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -133,7 +62,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -187,16 +115,12 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; - + end(); + timeg += getsec(); } timeg /= loops; diff --git a/benchmark/gemv.c b/benchmark/gemv.c index fb1f541d3..a0001277a 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GEMV @@ -52,72 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -211,10 +139,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } @@ -248,10 +176,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } diff --git a/benchmark/ger.c b/benchmark/ger.c index d53d328f0..7ce08c3ad 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GER @@ -49,72 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -131,7 +59,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -198,16 +125,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -66,71 +61,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -142,7 +72,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -194,22 +123,18 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GESV (&m, &m, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); - - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - + end(); + time1 = getsec(); 
diff --git a/benchmark/gesv.c b/benchmark/gesv.c
--- a/benchmark/gesv.c
+++ b/benchmark/gesv.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"

 double fabs(double);

@@ -66,71 +61,6 @@ double fabs(double);
 #endif
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *a, *b;
@@ -142,7 +72,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1;

   argc--;argv++;
@@ -194,22 +123,18 @@ int main(int argc, char *argv[]){
     }
   }

-  gettimeofday( &start, (struct timezone *)0);
+  begin();

   GESV (&m, &m, a, &m, ipiv, b, &m, &info);

-  gettimeofday( &stop, (struct timezone *)0);
-
-
-  time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
+  end();
+  time1 = getsec();

   fprintf(stderr, "%10.2f MFlops %10.6f s\n",
 	  COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);
-
 }

 return 0;
diff --git a/benchmark/getri.c b/benchmark/getri.c
index a07014768..98a860906 100644
--- a/benchmark/getri.c
+++ b/benchmark/getri.c
@@ -36,12 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"

 #undef GETRF
 #undef GETRI

@@ -72,71 +67,6 @@ extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv,
 		  FLOAT *work, blasint *lwork, blasint *info);

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *a,*work;
@@ -148,7 +78,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1;

   argc--;argv++;
@@ -205,21 +134,21 @@ int main(int argc, char *argv[]){
     exit(1);
   }

-  gettimeofday( &start, (struct timezone *)0);
+  begin();

   lwork = -1;
   GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);

   lwork = (blasint)wkopt[0];
   GETRI(&m, a, &m, ipiv, work, &lwork, &info);

-  gettimeofday( &stop, (struct timezone *)0);
+  end();

   if (info) {
     fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
     exit(1);
   }

-  time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+  time1 = getsec();

   fprintf(stderr,
 	  " %10.2f MFlops : %10.2f Sec : %d\n",
diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c
index 60ba9fb89..35249bdf9 100644
--- a/benchmark/hbmv.c
+++ b/benchmark/hbmv.c
@@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HBMV - #ifdef DOUBLE #define HBMV BLASFUNC(zhbmv) #else #define HBMV BLASFUNC(chbmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -125,7 +52,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -186,15 +112,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + timeg += getsec(); } diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 2bc165458..a0a9985ad 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMM @@ -41,72 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define HEMM BLASFUNC(chemm) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -126,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 98618a04e..ad130ddd0 100644 --- a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMV - #ifdef DOUBLE #define HEMV BLASFUNC(zhemv) #else #define HEMV BLASFUNC(chemv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +108,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/her.c b/benchmark/her.c index 010f8120d..cd1fb7f48 100644 --- a/benchmark/her.c +++ b/benchmark/her.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER - #ifdef DOUBLE #define HER BLASFUNC(zher) #else #define HER BLASFUNC(cher) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -126,8 +53,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timeval start, stop; double time1; argc--;argv++; @@ -166,15 +91,13 @@ int main(int argc, char *argv[]){ x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HER (&uplo, &m, alpha, x, &incx, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2.c b/benchmark/her2.c index 0f80f3ed9..d87bfd466 100644 --- a/benchmark/her2.c +++ b/benchmark/her2.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2 - #ifdef DOUBLE #define HER2 BLASFUNC(zher2) #else #define HER2 BLASFUNC(cher2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,16 +95,13 @@ int main(int argc, char *argv[]){ y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); - + begin(); HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 021873beb..d3cdce696 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2K #ifdef DOUBLE @@ -40,72 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define HER2K BLASFUNC(cher2k) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,13 +96,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/herk.c b/benchmark/herk.c index c09d35c1f..628dc2c11 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HERK - #ifdef DOUBLE #define HERK BLASFUNC(zherk) #else #define HERK BLASFUNC(cherk) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -167,18 +93,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); - } return 0; diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index b0157094e..907e2adc4 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HPMV - #ifdef DOUBLE #define HPMV BLASFUNC(zhpmv) #else #define HPMV BLASFUNC(chpmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -183,13 +109,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamax.c b/benchmark/iamax.c index c87044ab4..15618cbcc 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMAX @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamin.c b/benchmark/iamin.c index e7c8e59e4..a57638ecc 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMIN @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imax.c b/benchmark/imax.c index b56ef64ba..b96b17167 100644 --- a/benchmark/imax.c +++ b/benchmark/imax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imin.c b/benchmark/imin.c index 4a92c8bd0..095eacca9 100644 --- a/benchmark/imin.c +++ b/benchmark/imin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 661a44175..202035245 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -72,71 +67,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1, time2; argc--;argv++; @@ -198,31 +127,31 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GETRF (&m, &m, a, &m, ipiv, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - gettimeofday( &start, (struct timezone *)0); + begin(); GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time2 = getsec(); maxerr = 0.; diff --git a/benchmark/max.c b/benchmark/max.c index a19a386a2..301b943a5 100644 --- a/benchmark/max.c +++ b/benchmark/max.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/min.c b/benchmark/min.c index 4df8fb0fd..39df37a29 100644 --- a/benchmark/min.c +++ b/benchmark/min.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c index 0f416621a..cd64d564a 100644 --- a/benchmark/nrm2.c +++ b/benchmark/nrm2.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NRM2 @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NRM2 (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/potrf.c b/benchmark/potrf.c index cb4c23bab..116d0cca5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -86,37 +81,7 @@ double fabs(double); // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif int main(int argc, char *argv[]){ @@ -141,7 +106,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -217,18 +181,18 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrf info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; if ( btest == 'S' ) @@ -240,17 +204,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; } @@ -258,18 +222,18 @@ int main(int argc, char *argv[]){ if ( btest == 'I' ) { - gettimeofday( &start, (struct timezone *)0); + begin(); POTRI(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potri info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; } diff --git a/benchmark/rot.c b/benchmark/rot.c index 69698988d..15b630e36 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROT @@ -52,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *x, *y;
@@ -133,7 +63,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1,timeg;

   argc--;argv++;
@@ -179,13 +108,13 @@ int main(int argc, char *argv[]){
 		for (l=0; l<loops; l++){

-			gettimeofday( &start, (struct timezone *)0);
+			begin();

 			ROT (&m, x, &inc_x, y, &inc_y, c, s);

-			gettimeofday( &stop, (struct timezone *)0);
-			time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+			end();
+			time1 = getsec();

 			timeg += time1;
 		}
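The huge_malloc shims deleted throughout this patch were already dead code (their guard ends in && 0), but the technique they used is worth noting: a SysV shared-memory segment backed by huge pages. A standalone, Linux-only sketch distilled from the deleted block, with HUGE_PAGESIZE assumed to be 2 MiB here:

/* Hugepage allocation as in the deleted huge_malloc: get a SHM_HUGETLB
   segment, attach it, then mark the id for removal so the memory
   vanishes when the process detaches or exits. Requires huge pages to
   be reserved beforehand (sysctl vm.nr_hugepages). */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

#define HUGE_PAGESIZE (2UL * 1024 * 1024)  /* assumed 2 MiB huge pages */

static void *huge_malloc(size_t size) {
    /* round the request up to a whole number of huge pages */
    size_t rounded = (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1);
    int shmid = shmget(IPC_PRIVATE, rounded,
                       SHM_HUGETLB | IPC_CREAT | 0600);
    if (shmid < 0) {
        perror("shmget");
        exit(1);
    }
    void *address = shmat(shmid, NULL, SHM_RND);
    if (address == (void *)-1) {
        perror("shmat");
        exit(1);
    }
    /* segment id removed now; the mapping lives until shmdt()/exit */
    shmctl(shmid, IPC_RMID, NULL);
    return address;
}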
diff --git a/benchmark/rotm.c b/benchmark/rotm.c
--- a/benchmark/rotm.c
+++ b/benchmark/rotm.c
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"

 #undef ROTM

@@ -40,72 +35,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ROTM BLASFUNC(srotm)
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz)
-{
-
-	FILETIME ft;
-	unsigned __int64 tmpres = 0;
-	static int tzflag;
-
-	if (NULL != tv) {
-		GetSystemTimeAsFileTime(&ft);
-
-		tmpres |= ft.dwHighDateTime;
-		tmpres <<= 32;
-		tmpres |= ft.dwLowDateTime;
-
-		/*converting file time to unix epoch*/
-		tmpres /= 10;	/*convert into microseconds*/
-		tmpres -= DELTA_EPOCH_IN_MICROSECS;
-		tv->tv_sec = (long)(tmpres / 1000000UL);
-		tv->tv_usec = (long)(tmpres % 1000000UL);
-	}
-
-	return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
-	int shmid;
-	void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-	if ((shmid =
-	     shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-		    SHM_HUGETLB | IPC_CREAT | 0600)) < 0) {
-		printf("Memory allocation failed(shmget).\n");
-		exit(1);
-	}
-
-	address = shmat(shmid, NULL, SHM_RND);
-
-	if ((BLASLONG)address == -1) {
-		printf("Memory allocation failed(shmat).\n");
-		exit(1);
-	}
-
-	shmctl(shmid, IPC_RMID, 0);
-
-	return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[])
 {

 	FLOAT *x, *y;
@@ -122,7 +51,7 @@ int main(int argc, char *argv[])
 	int to = 200;
 	int step = 1;

-	struct timeval start, stop;
+
 	double time1, timeg;

 	argc--;
@@ -188,14 +117,13 @@ int main(int argc, char *argv[])
 	}

 	for (l = 0; l < loops; l++) {
-		gettimeofday(&start, (struct timezone *)0);
+		begin();

 		ROTM(&m, x, &inc_x, y, &inc_y, param);

-		gettimeofday(&stop, (struct timezone *)0);
+		end();

-		time1 = (double)(stop.tv_sec - start.tv_sec) +
-			(double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+		time1 = getsec();

 		timeg += time1;
 	}
diff --git a/benchmark/scal.c b/benchmark/scal.c
index 8bd62c77c..8de6cfd04 100644
--- a/benchmark/scal.c
+++ b/benchmark/scal.c
@@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"

 #undef SCAL

@@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif

-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-  {
-    GetSystemTimeAsFileTime(&ft);
-
-    tmpres |= ft.dwHighDateTime;
-    tmpres <<= 32;
-    tmpres |= ft.dwLowDateTime;
-
-    /*converting file time to unix epoch*/
-    tmpres /= 10;  /*convert into microseconds*/
-    tmpres -= DELTA_EPOCH_IN_MICROSECS;
-    tv->tv_sec = (long)(tmpres / 1000000UL);
-    tv->tv_usec = (long)(tmpres % 1000000UL);
-  }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){

   FLOAT *x, *y;
@@ -128,7 +57,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;

-  struct timeval start, stop;
   double time1,timeg;

   argc--;argv++;
@@ -174,13 +102,13 @@ int main(int argc, char *argv[]){
 		for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 			y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
 		}
-		gettimeofday( &start, (struct timezone *)0);
+		begin();

 		SCAL (&m, alpha, x, &inc_x);

-		gettimeofday( &stop, (struct timezone *)0);
+		end();

-		time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+		time1 = getsec();

 		timeg += time1;
diff --git a/benchmark/spmv.c b/benchmark/spmv.c
index cff504d3b..e4dcbf4ae 100644
--- a/benchmark/spmv.c
+++ b/benchmark/spmv.c
@@ -25,17 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"

 #undef SPMV
-
 #ifndef COMPLEX

 #ifdef DOUBLE
@@ -54,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -135,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -193,13 +120,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spr.c b/benchmark/spr.c index 5dcaa4f8b..2fc9994f8 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPR @@ -41,73 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SPR BLASFUNC(sspr) #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*c; @@ -129,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -173,13 +99,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR (&uplo, &m, alpha, c, &inc_x, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/spr2.c b/benchmark/spr2.c index a5f2791f7..8f194e83a 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SPR2 @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*b,*c; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +110,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/swap.c b/benchmark/swap.c index 76d545995..64ebe5e9b 100644 --- a/benchmark/swap.c +++ b/benchmark/swap.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SWAP @@ -49,71 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +104,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SWAP (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/symm.c b/benchmark/symm.c index bb9849eb5..1c6d91d00 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
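
/* The Windows gettimeofday() shim deleted throughout these files converted
   FILETIME (100 ns ticks since 1601-01-01) to the Unix epoch, and
   DELTA_EPOCH_IN_MICROSECS is simply the 1601 -> 1970 offset expressed in
   microseconds: 134774 days * 86400 s/day * 1e6 us/s. A standalone,
   compile-time sanity check of that constant (C11): */

_Static_assert(134774ULL * 86400ULL * 1000000ULL == 11644473600000000ULL,
               "1601-01-01 -> 1970-01-01 offset in microseconds");
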
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/symv.c b/benchmark/symv.c index e4c892b5a..0a35aaef0 100644 --- a/benchmark/symv.c +++ b/benchmark/symv.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMV @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -134,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -192,13 +120,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/syr.c b/benchmark/syr.c index a9dd293e6..ebbf2bd3c 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x,*a; @@ -124,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -165,13 +93,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2.c b/benchmark/syr2.c index 9efbca315..acbc86987 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYR2 @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYR2 BLASFUNC(ssyr2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y, *a; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -174,13 +101,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index a906559eb..3895c2861 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR2K @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +67,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +110,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 0fbb943f6..82606a21a 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYRK @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -177,13 +105,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c index fe9d07534..41f2e0fb8 100644 --- a/benchmark/tpmv.c +++ b/benchmark/tpmv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
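
/* How the " %10.2f MFlops" lines printed by the hunks above arrive at a
   figure: an operation count for the routine divided by the time reported
   by getsec(). The per-routine counts sit outside the diff context shown,
   so the syrk count quoted below (m*m*(m+1) flops for real C := alpha*A*A' +
   beta*C) is a conventional illustration, not taken from this patch: */

static double mflops(double ops, double sec) {
  return (ops / sec) * 1.e-6;  /* operations per second, in millions */
}
/* e.g. fprintf(stderr, " %10.2f MFlops\n", mflops((double)m*m*(m+1), time1)); */
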
#endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c index 8472ac261..ebfa29692 100644 --- a/benchmark/tpsv.c +++ b/benchmark/tpsv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPSV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 23af122b4..3ab9fc255 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -141,7 +71,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -180,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", diff --git a/benchmark/trmv.c b/benchmark/trmv.c index 46641b3e4..0e8088b54 100644 --- a/benchmark/trmv.c +++ b/benchmark/trmv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trsm.c b/benchmark/trsm.c index 17676946a..d2ebd7f54 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRSM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -196,13 +125,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 1734e2adb..66ac3a3c7 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -25,14 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include -#include "common.h" - +#include "bench.h" #undef GEMV #undef TRSV @@ -55,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
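
/* One semantic point about the trsv.c hunks below: trsv previously timed
   with clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...), i.e. CPU time consumed
   by the process, while most other benchmarks measured wall-clock time via
   gettimeofday(). Moving everything to begin()/end() settles all files on
   whatever single clock bench.h uses, which is not visible in this patch.
   A standalone illustration of the difference being unified away: */

#include <stdio.h>
#include <time.h>

int main(void) {
  struct timespec wall, cpu;
  clock_gettime(CLOCK_REALTIME, &wall);          /* wall-clock time */
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu); /* CPU time of this process */
  printf("wall=%lld.%09ld cpu=%lld.%09ld\n",
         (long long)wall.tv_sec, wall.tv_nsec,
         (long long)cpu.tv_sec, cpu.tv_nsec);
  return 0;
}
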
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -133,7 +61,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timespec time_start, time_end; time_t seconds = 0; double time1,timeg; @@ -189,19 +116,13 @@ int main(int argc, char *argv[]){ for(l =0;l< loops;l++){ - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); - + begin(); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); - - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); - nanos = time_end.tv_nsec - time_start.tv_nsec; - seconds = time_end.tv_sec - time_start.tv_sec; - - time1 = seconds + nanos /1.e9; + end(); + time1 = getsec(); timeg += time1; } - timeg /= loops; long long muls = n*(n+1)/2.0; long long adds = (n - 1.0)*n/2.0; diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c index ba1515365..06cdde13a 100644 --- a/benchmark/zdot-intel.c +++ b/benchmark/zdot-intel.c @@ -25,90 +25,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#define RETURN_BY_STACK 1 -#include "common.h" +#include "bench.h" +#define RETURN_BY_STACK 1 #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(zdotu) #else #define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -123,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); DOT (&result, &m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index fa624e859..23b3efcad 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
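
/* zdot is the one routine here with two call forms (see the #ifdef
   RETURN_BY_STACK in the hunk below, and the RETURN_BY_STACK 1 define kept
   in zdot-intel.c above): a complex dot product can be returned by value or
   through a hidden result pointer, depending on the Fortran ABI the library
   targets. A standalone illustration of the two conventions; all names and
   types here are illustrative, not from the patch: */

#include <complex.h>
#include <stdio.h>

/* by-value convention, like "result = DOT(&m, x, &inc_x, y, &inc_y)" */
static float complex dot_by_value(int n, const float complex *x,
                                  const float complex *y) {
  float complex s = 0.0f;
  for (int i = 0; i < n; i++) s += x[i] * y[i];  /* unconjugated, as in cdotu */
  return s;
}

/* hidden-pointer convention, like "DOT(&result, &m, x, &inc_x, y, &inc_y)" */
static void dot_by_stack(float complex *result, int n,
                         const float complex *x, const float complex *y) {
  *result = dot_by_value(n, x, y);
}

int main(void) {
  float complex x[2] = { 1.0f + 2.0f * I, 3.0f * I };
  float complex y[2] = { 2.0f, 1.0f - 1.0f * I };
  float complex r;
  dot_by_stack(&r, 2, x, y);
  printf("%g%+gi\n", crealf(r), cimagf(r));  /* expect 5+7i */
  return 0;
}
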
 #define DOT BLASFUNC(cdotu)
 #endif
 
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
-  FILETIME ft;
-  unsigned __int64 tmpres = 0;
-  static int tzflag;
-
-  if (NULL != tv)
-    {
-      GetSystemTimeAsFileTime(&ft);
-
-      tmpres |= ft.dwHighDateTime;
-      tmpres <<= 32;
-      tmpres |= ft.dwLowDateTime;
-
-      /*converting file time to unix epoch*/
-      tmpres /= 10;  /*convert into microseconds*/
-      tmpres -= DELTA_EPOCH_IN_MICROSECS;
-      tv->tv_sec = (long)(tmpres / 1000000UL);
-      tv->tv_usec = (long)(tmpres % 1000000UL);
-    }
-
-  return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
-  int shmid;
-  void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-  if ((shmid =shmget(IPC_PRIVATE,
-                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
-                     SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
-    printf( "Memory allocation failed(shmget).\n");
-    exit(1);
-  }
-
-  address = shmat(shmid, NULL, SHM_RND);
-
-  if ((BLASLONG)address == -1){
-    printf( "Memory allocation failed(shmat).\n");
-    exit(1);
-  }
-
-  shmctl(shmid, IPC_RMID, 0);
-
-  return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
 int main(int argc, char *argv[]){
 
   FLOAT *x, *y;
@@ -122,7 +50,6 @@ int main(int argc, char *argv[]){
   int to = 200;
   int step = 1;
 
-  struct timeval start, stop;
   double time1,timeg;
 
   argc--;argv++;
@@ -169,15 +96,15 @@ int main(int argc, char *argv[]){
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
-     gettimeofday( &start, (struct timezone *)0);
+     begin();
 
 #ifdef RETURN_BY_STACK
      DOT (&result , &m, x, &inc_x, y, &inc_y );
 #else
      result = DOT (&m, x, &inc_x, y, &inc_y );
 #endif
-     gettimeofday( &stop, (struct timezone *)0);
+     end();
 
-     time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+     time1 = getsec();
 
      timeg += time1;
 
diff --git a/c_check b/c_check
index 5ea93b75c..970d475d7 100644
--- a/c_check
+++ b/c_check
@@ -6,7 +6,8 @@
 # Checking cross compile
 $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
 $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
-$hostarch = `uname -p` if ($hostos eq "AIX");
+$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
+chop($hostarch);
 $hostarch = "x86_64" if ($hostarch eq "amd64");
 $hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/);
 $hostarch = "arm64" if ($hostarch eq "aarch64");
@@ -92,6 +93,7 @@
 $architecture = ia64 if ($data =~ /ARCH_IA64/);
 $architecture = arm if ($data =~ /ARCH_ARM/);
 $architecture = arm64 if ($data =~ /ARCH_ARM64/);
 $architecture = zarch if ($data =~ /ARCH_ZARCH/);
+$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
 
 $defined = 0;
 
@@ -136,6 +138,11 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) {
     $binary =32;
 }
 
+if ($architecture eq "riscv64") {
+    $defined = 1;
+    $binary = 64;
+}
+
 if ($compiler eq "PGI") {
     $compiler_name .= " -tp p7" if ($binary eq "32");
     $compiler_name .= " -tp p7-64" if ($binary eq "64");
@@ -192,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
 } else {
     $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
     $code = '"addvi.b $w0, $w1, 1"';
-    $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
+    $msa_flags = "-mmsa -mfp64 -mload-store-pairs";
     print $tmpf "#include <msa.h>\n\n";
     print $tmpf "void main(void){ __asm__
volatile($code); }\n"; @@ -270,6 +277,15 @@ if ($data =~ /HAVE_C11/) { } } +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { + $no_avx2 = 0; + $oldgcc = 0; + $data = `$compiler_name -dumpversion`; + if ($data <= 4.6) { + $no_avx2 = 1; + $oldgcc = 1; + } +} $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; @@ -362,6 +378,8 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; +print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; +print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; diff --git a/cblas.h b/cblas.h index bf310bed2..da00d46d6 100644 --- a/cblas.h +++ b/cblas.h @@ -393,6 +393,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 2f4d1c6d7..76952152b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") endif () endif () -if (${CORE} STREQUAL "SKYLAKEX") +if (${CORE} STREQUAL SKYLAKEX) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") @@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () -if (${CORE} STREQUAL "COOPERLAKE") +if (${CORE} STREQUAL COOPERLAKE) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) @@ -124,6 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () + if (HAVE_FMA3) + set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 7d7f5ffda..0c102bae5 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -184,8 +184,8 @@ macro(SetDefaultL2) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../arm/gemv_n.c) - set(SBGEMVTKERNEL ../arm/gemv_t.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) set(SHGERKERNEL ../generic/ger.c) endif () endmacro () diff --git a/cmake/os.cmake b/cmake/os.cmake index c644bc3f7..e24059dd5 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,6 +84,14 @@ if (X86) set(NO_EXPRECISION 1) endif () +if (DYNAMIC_ARCH) +if (TARGET) +if (${TARGET} STREQUAL "GENERIC") + set(NO_EXPRECISION 1) +endif () +endif () +endif () + if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") set(SANITY_CHECK 1) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 
3e38abbf5..da7686c33 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -556,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING) MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") endif () endif () + unset (HAVE_AVX2) + unset (HAVE_AVX) + unset (HAVE_FMA3) + unset (HAVE_MMX) + unset (HAVE_SSE) + unset (HAVE_SSE2) + unset (HAVE_SSE3) + unset (HAVE_SSSE3) + unset (HAVE_SSE4A) + unset (HAVE_SSE4_1) + unset (HAVE_SSE4_2) + unset (HAVE_NEON) + unset (HAVE_VFP) + unset (HAVE_VFPV3) + unset (HAVE_VFPV4) message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way diff --git a/cmake/system.cmake b/cmake/system.cmake index 4cc46236d..66e95c6d3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -44,50 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () -if (DEFINED TARGET) - if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") - else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() - endif() - if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() - endif() - if (DEFINED HAVE_SSE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") - endif() - if (DEFINED HAVE_SSE2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") - endif() - if (DEFINED HAVE_SSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") - endif() - if (DEFINED HAVE_SSE4_1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") - endif() -endif() if (DEFINED TARGET) + message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --") message(STATUS "Targeting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () @@ -187,6 +146,63 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") +if (DEFINED TARGET) + if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") +# endif() + endif() + if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + if (${TARGET} STREQUAL 
HASWELL AND NOT NO_AVX2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_AVX) + if (NOT NO_AVX) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") + endif() + endif() + if (DEFINED HAVE_AVX2) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_FMA3) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + endif() + endif() + if (DEFINED HAVE_SSE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") + endif() + if (DEFINED HAVE_SSE2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") + endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (DEFINED HAVE_SSSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") + endif() + if (DEFINED HAVE_SSE4_1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") + endif() +endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") endif () diff --git a/common.h b/common.h index a3ef99b59..2825407cb 100644 --- a/common.h +++ b/common.h @@ -437,6 +437,11 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif + +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_arm64.h b/common_arm64.h index 314946282..9cdded305 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -142,14 +142,8 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(CORTEXA57) -#define BUFFER_SIZE (20 << 20) -#elif defined(TSV110) || defined(EMAG8180) #define BUFFER_SIZE (32 << 20) #else -#define BUFFER_SIZE (16 << 20) -#endif -#else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif diff --git a/common_interface.h b/common_interface.h index 032877fe1..b9ebb2772 100644 --- a/common_interface.h +++ b/common_interface.h @@ -250,6 +250,8 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *, + bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, diff --git a/common_level2.h b/common_level2.h index 640d4a073..9a5ebb4d9 100644 --- a/common_level2.h +++ b/common_level2.h @@ -44,6 +44,10 @@ extern "C" { #endif +int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); +int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, 
float *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. -#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_macro.h b/common_macro.h index 54deed57c..c6ea1bfd9 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,10 +646,12 @@ #elif defined(BFLOAT16) -#define D_TO_BF16_K SBDTOBF16_K -#define D_BF16_TO_K DBF16TOD_K -#define S_TO_BF16_K SBSTOBF16_K -#define S_BF16_TO_K SBF16TOS_K +#define D_TO_BF16_K SBDTOBF16_K +#define D_BF16_TO_K DBF16TOD_K +#define S_TO_BF16_K SBSTOBF16_K +#define S_BF16_TO_K SBF16TOS_K +#define SBGEMV_N SBGEMV_N_K +#define SBGEMV_T SBGEMV_T_K #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/common_param.h b/common_param.h index b50e4ff80..3e3ae06f8 100644 --- a/common_param.h +++ b/common_param.h @@ -78,8 +78,8 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); + int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); diff --git a/common_power.h b/common_power.h index a61e4e28a..a49197fd7 100644 --- a/common_power.h +++ b/common_power.h @@ -849,6 +849,10 @@ Lmcount$lazy_ptr: #else #define BUFFER_SIZE 
( 16 << 20) #endif +#ifdef DYNAMIC_ARCH +#undef BUFFER_SIZE +#define BUFFER_SIZE (64 << 22) +#endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) diff --git a/common_riscv64.h b/common_riscv64.h new file mode 100644 index 000000000..27f385dfd --- /dev/null +++ b/common_riscv64.h @@ -0,0 +1,98 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_RISCV64 +#define COMMON_RISCV64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#if defined(C910V) +#include +#endif + +#endif diff --git a/common_sb.h b/common_sb.h index 66968ab00..9976e812e 100644 --- a/common_sb.h +++ b/common_sb.h @@ -8,6 +8,8 @@ #define SBDTOBF16_K sbdtobf16_k #define SBF16TOS_K sbf16tos_k #define DBF16TOD_K dbf16tod_k +#define SBGEMV_N_K sbgemv_n +#define SBGEMV_T_K sbgemv_t #define SBGEMM_ONCOPY sbgemm_oncopy #define SBGEMM_OTCOPY sbgemm_otcopy @@ -29,6 +31,8 @@ #define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBF16TOS_K gotoblas -> sbf16tos_k #define DBF16TOD_K gotoblas -> dbf16tod_k +#define SBGEMV_N_K gotoblas -> sbgemv_n +#define SBGEMV_T_K gotoblas -> sbgemv_t #define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy #define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy diff --git a/common_sparc.h b/common_sparc.h index 85e29fffa..90a24ebf1 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -78,6 +78,12 @@ static __inline unsigned long rpcc(void){ #define __BIG_ENDIAN__ #endif +#ifdef C_SUN +#ifndef __64BIT +#define RETURN_BY_STACK +#endif +#endif + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") #else diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..674b65908 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -140,14 +105,16 @@ int detect(void){ } fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; } #endif return CPU_UNKNOWN; + } } char *get_corename(void){ @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -237,10 +204,10 @@ void get_cpuconfig(void){ } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else 
if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c new file mode 100644 index 000000000..0eb50e001 --- /dev/null +++ b/cpuid_riscv64.c @@ -0,0 +1,113 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT   */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,      */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES        */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE       */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR            */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF      */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT       */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT      */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
+/* POSSIBILITY OF SUCH DAMAGE.                                     */
+/*                                                                 */
+/* The views and conclusions contained in the software and         */
+/* documentation are those of the authors and should not be        */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.               */
+/*********************************************************************/
+
+#define CPU_UNKNOWN 0
+#define CPU_C910V   1
+
+static char *cpuname[] = {
+  "UNKNOWN",
+  "C910V"
+};
+
+int detect(void){
+  return CPU_UNKNOWN;
+}
+
+char *get_corename(void){
+  return cpuname[detect()];
+}
+
+void get_architecture(void){
+  printf("RISCV64");
+}
+
+void get_subarchitecture(void){
+}
+
+void get_subdirname(void){
+  printf("riscv64");
+}
+
+void get_cpuconfig(void){
+  printf("#define UNKNOWN\n");
+  printf("#define L1_DATA_SIZE 65536\n");
+  printf("#define L1_DATA_LINESIZE 32\n");
+  printf("#define L2_SIZE 512488\n");
+  printf("#define L2_LINESIZE 32\n");
+  printf("#define DTB_DEFAULT_ENTRIES 64\n");
+  printf("#define DTB_SIZE 4096\n");
+  printf("#define L2_ASSOCIATIVE 4\n");
+}
+
+void get_libname(void){
+  printf("riscv64\n");
+}
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 728d459d1..84c12ff43 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -202,7 +202,7 @@ int support_avx(){
   if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
     xgetbv(0, &eax, &edx);
     if((eax & 6) == 6){
-      ret=1;  //OS support AVX
+      ret=1;  //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2))
     }
   }
   return ret;
@@ -219,8 +219,8 @@ int support_avx2(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 0)
-    ret=1;  //OS supports AVX2
+  if((ebx & (1<<5)) != 0)
+    ret=1;  //CPU supports AVX2
   return ret;
 #else
   return 0;
@@ -235,14 +235,14 @@ int support_avx512(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & 32) != 32){
-    ret=0;  //OS does not even support AVX2
+  if((ebx & (1<<5)) == 0){
+    ret=0;  //cpu does not have avx2 flag
   }
-  if((ebx & (1<<31)) != 0){
+  if((ebx & (1<<31)) != 0){ //AVX512VL flag
     xgetbv(0, &eax, &edx);
     if((eax & 0xe0) == 0xe0)
-      ret=1;  //OS supports AVX512VL
-  }
+      ret=1;  //OS supports saving zmm registers
+  }
   return ret;
 #else
   return 0;
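The support_avx512() fix above separates two distinct questions: whether the CPU advertises the feature (CPUID leaf 7, EBX bit 5 = AVX2, bit 31 = AVX512VL) and whether the OS has enabled saving the wide register state (XGETBV on XCR0: 0x06 covers XMM/YMM, 0xe0 covers opmask and the ZMM halves). A minimal standalone sketch of the same three-step probe, not OpenBLAS code, assuming a GCC/Clang toolchain on x86-64 where <cpuid.h> and _xgetbv (needs -mxsave) are available:

/* Illustrative only -- mirrors the corrected checks in cpuid_x86.c. */
#include <cpuid.h>
#include <immintrin.h>

static int os_and_cpu_support_avx512vl(void) {
    unsigned int eax, ebx, ecx, edx;

    /* CPUID leaf 1: OSXSAVE (bit 27) and AVX (bit 28) must both be set. */
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 0;
    if (!(ecx & (1u << 27)) || !(ecx & (1u << 28))) return 0;

    /* CPUID leaf 7: bit 5 = AVX2, bit 31 = AVX512VL (CPU feature flags). */
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return 0;
    if (!(ebx & (1u << 5)) || !(ebx & (1u << 31))) return 0;

    /* XGETBV(0): has the OS enabled saving of the wide register state?
       0x06 = XMM+YMM, 0xe0 = opmask + ZMM0-15 upper halves + ZMM16-31. */
    unsigned long long xcr0 = _xgetbv(0);
    return ((xcr0 & 0x06) == 0x06) && ((xcr0 & 0xe0) == 0xe0);
}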
diff --git a/ctest.c b/ctest.c
index cd84ab1bb..d674a8cbd 100644
--- a/ctest.c
+++ b/ctest.c
@@ -153,6 +153,11 @@ ARCH_ARM
 ARCH_ARM64
 #endif
 
+#if defined(__riscv)
+ARCH_RISCV64
+#endif
+
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
 HAVE_C11
 #endif
+
diff --git a/ctest/Makefile b/ctest/Makefile
index cba904f75..2a893cae8 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -61,7 +61,7 @@ endif
 all1: $(all1targets)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OMP_NUM_THREADS=2 ./xscblat1
@@ -106,7 +106,7 @@ endif
 all2: $(all2targets)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OMP_NUM_THREADS=2 ./xscblat2 < sin2
@@ -152,7 +152,7 @@ endif
 all3: $(all3targets)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OMP_NUM_THREADS=2 ./xscblat3 < sin3
diff --git a/driver/level2/Makefile b/driver/level2/Makefile
index 7212d6662..caecf4f97 100644
--- a/driver/level2/Makefile
+++ b/driver/level2/Makefile
@@ -413,7 +413,13 @@ XBLASOBJS += \
 	xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \
 	xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \
 	xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \
-	xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \
+	xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX)
+
+ifeq ($(BUILD_BFLOAT16),1)
+SBBLASOBJS += \
+	sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \
+	sbgemv_thread_t$(TSUFFIX).$(SUFFIX)
+endif
 
 endif
 
@@ -3693,4 +3699,12 @@ xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
 xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
 	$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
 
+ifeq ($(BUILD_BFLOAT16),1)
+sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
+sbgemv_thread_t.$(SUFFIX) sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
+endif
+
+
 include ../../Makefile.tail
diff --git a/driver/level2/sbgemv_thread.c b/driver/level2/sbgemv_thread.c
new file mode 100644
index 000000000..534c60f95
--- /dev/null
+++ b/driver/level2/sbgemv_thread.c
@@ -0,0 +1,149 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
+/* POSSIBILITY OF SUCH DAMAGE.                                       */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
+#ifndef TRANSA
+#define SBGEMV SBGEMV_N
+#else
+#define SBGEMV SBGEMV_T
+#endif
+
+static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){
+
+  bfloat16 *a, *x;
+  float    *y;
+  BLASLONG lda, incx, incy;
+  BLASLONG m_from, m_to, n_from, n_to;
+
+  a = (bfloat16 *)args->a;
+  x = (bfloat16 *)args->b;
+  y = (float *)args->c;
+
+  lda  = args->lda;
+  incx = args->ldb;
+  incy = args->ldc;
+
+#ifndef TRANSA        // N
+  m_from = *(range_m + 0);
+  m_to   = *(range_m + 1);
+  n_from = 0;
+  n_to   = args -> n;
+  a += m_from;
+  y += m_from * incy;
+#else                 // T
+  m_from = 0;
+  m_to   = args->m;
+  n_from = *(range_n + 0);
+  n_to   = *(range_n + 1);
+  a += n_from * lda;
+  y += n_from * incy;
+#endif
+
+  SBGEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy);
+
+  return 0;
+}
+
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads)
+{
+  blas_arg_t args;
+  blas_queue_t queue[MAX_CPU_NUMBER];
+  BLASLONG range[MAX_CPU_NUMBER + 1];
+
+#ifndef TRANSA
+  BLASLONG width_for_split = m;
+#else
+  BLASLONG width_for_split = n;
+#endif
+
+  BLASLONG BLOCK_WIDTH = width_for_split/threads;
+
+  int mode = BLAS_BFLOAT16 | BLAS_REAL;
+
+  args.m = m;
+  args.n = n;
+  args.a = (void *)a;
+  args.b = (void *)x;
+  args.c = (void *)y;
+  args.lda = lda;
+  args.ldb = incx;
+  args.ldc = incy;
+  args.alpha = (void *)&alpha;
+  args.beta  = (void *)&beta;
+
+  range[0] = 0;
+
+  int thread_idx;
+
+  for (thread_idx=0; thread_idx<threads; thread_idx++) {
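The rest of this loop builds the per-thread dispatch in the usual OpenBLAS exec_blas style: split width_for_split (rows for the N case, columns for the T case) into BLOCK_WIDTH-sized ranges, point each queue entry at sbgemv_kernel, and run the queue. A condensed sketch of that pattern, using the names declared in CNAME above; this is an illustration of the technique, not the verbatim upstream loop:

/* Sketch only: standard OpenBLAS queue setup for a split level-2 driver. */
for (thread_idx = 0; thread_idx < threads; thread_idx++) {
    /* The last thread absorbs the remainder so the ranges cover everything. */
    range[thread_idx + 1] = (thread_idx == threads - 1)
                          ? range[thread_idx] + width_for_split
                          : range[thread_idx] + BLOCK_WIDTH;
    width_for_split -= BLOCK_WIDTH;

    queue[thread_idx].mode    = mode;
    queue[thread_idx].routine = sbgemv_kernel;
    queue[thread_idx].args    = &args;
    /* sbgemv_kernel reads range_m in the N case and range_n in the T case. */
    queue[thread_idx].range_m = &range[thread_idx];
    queue[thread_idx].range_n = &range[thread_idx];
    queue[thread_idx].sa      = NULL;
    queue[thread_idx].sb      = NULL;
    queue[thread_idx].next    = &queue[thread_idx + 1];
}
queue[threads - 1].next = NULL;   /* terminate the queue list */
exec_blas(threads, queue);        /* run the kernels on the worker threads */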
diff --git a/driver/level3/level3.c b/driver/level3/level3.c
 	    if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
 	    else
-	    if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
+/*
+	    if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
 	    else
+*/
 	    if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 6e1fd9e99..2b33c9589 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 #else
 	    if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
 	    else
+/*
 	    if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
 	    else
+*/
 	    if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
 #endif
 	  /* Copy part of local region of B into workspace */
diff --git a/driver/others/Makefile b/driver/others/Makefile
index 7558ec058..4a421ef31 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -7,7 +7,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
 
 ifdef SMP
 COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
-ifndef NO_AFFINITY
+ifneq ($(NO_AFFINITY), 1)
 COMMONOBJS += init.$(SUFFIX)
 endif
 endif
@@ -24,19 +24,23 @@ else
 ifeq ($(ARCH),zarch)
 COMMONOBJS += dynamic_zarch.$(SUFFIX)
 else
+ifeq ($(ARCH),mips64)
+COMMONOBJS += dynamic_mips64.$(SUFFIX)
+else
 COMMONOBJS += dynamic.$(SUFFIX)
 endif
 endif
 endif
+endif
 else
 COMMONOBJS += parameter.$(SUFFIX)
 endif
 
-ifdef EXPRECISION
+ifeq ($(EXPRECISION), 1)
 COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX)
 endif
 
-ifdef QUAD_PRECISION
+ifeq ($(QUAD_PRECISION), 1)
 COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX)
 endif
 
@@ -46,11 +50,9 @@ ifeq ($(C_COMPILER), PGI)
 endif
 endif
 
-ifdef USE_CUDA
 ifeq ($(USE_CUDA), 1)
 COMMONOBJS += cuda_init.$(SUFFIX)
 endif
-endif
 
 ifdef FUNCTION_PROFILE
 COMMONOBJS += profile.$(SUFFIX)
@@ -94,10 +96,14 @@ else
 ifeq ($(ARCH),zarch)
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
 else
+ifeq ($(ARCH),mips64)
+HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX)
+else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
 endif
 endif
 endif
+endif
 else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
 endif
diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c
index 04acbcc5f..06039c952 100644
--- a/driver/others/blas_l1_thread.c
+++ b/driver/others/blas_l1_thread.c
@@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
     break;
   }
 
-  mode |= BLAS_LEGACY;
+  if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY;
 
   for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 30e0cc6c2..5e0943c2e 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) {
   blas_cpu_number  = num_threads;
 
 #if defined(ARCH_MIPS64)
+#ifndef DYNAMIC_ARCH
   //set parameters for different number of threads.
   blas_set_parameter();
 #endif
+#endif
 }
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index d546553c1..a576127aa 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -76,10 +76,28 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 #endif
 
-void goto_set_num_threads(int num_threads) {
+static void adjust_thread_buffers() {
 
   int i=0, j=0;
 
+  //adjust buffer for each thread
+  for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
+    for(j=0; j < blas_cpu_number; j++){
+      if(blas_thread_buffer[i][j] == NULL){
+        blas_thread_buffer[i][j] = blas_memory_alloc(2);
+      }
+    }
+    for(; j < MAX_CPU_NUMBER; j++){
+      if(blas_thread_buffer[i][j] != NULL){
+        blas_memory_free(blas_thread_buffer[i][j]);
+        blas_thread_buffer[i][j] = NULL;
+      }
+    }
+  }
+}
+
+void goto_set_num_threads(int num_threads) {
+
   if (num_threads < 1) num_threads = blas_num_threads;
 
   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@@ -92,20 +110,7 @@ void goto_set_num_threads(int num_threads) {
 
   omp_set_num_threads(blas_cpu_number);
 
-  //adjust buffer for each thread
-  for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
-    for(j=0; j<blas_cpu_number; j++){
-      if(blas_thread_buffer[i][j]==NULL){
-        blas_thread_buffer[i][j]=blas_memory_alloc(2);
-      }
-    }
-    for(; j<MAX_CPU_NUMBER; j++){
-      if(blas_thread_buffer[i][j]!=NULL){
-        blas_memory_free(blas_thread_buffer[i][j]);
-        blas_thread_buffer[i][j]=NULL;
-      }
-    }
-  }
+  adjust_thread_buffers();
 }
 
        queue->sb=sb;
    }
  }
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 4624085d5..42f289441 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 
   routine = queue -> routine;
 
-  if (!(queue -> mode & BLAS_LEGACY)) {
+  if (queue -> mode & BLAS_LEGACY) {
+    legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
+  } else
+    if (queue -> mode & BLAS_PTHREAD) {
+      void (*pthreadcompat)(void *) = queue -> routine;
+      (pthreadcompat)(queue -> args);
+    } else
     (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0);
-  } else {
-    legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
-  }
 
   if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 21d2c7948..58f4d8b59 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -330,8 +330,8 @@ int support_avx2(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 0)
-    ret=1;  //OS supports AVX2
+  if((ebx & (1<<5)) != 0)
+    ret=1;  //AVX2 flag is set
   return ret;
 #else
   return 0;
@@ -346,13 +346,13 @@ int support_avx512(){
   if (!support_avx()) return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) == 0){
-    ret=0;  //OS does not even support AVX2
+  if((ebx & (1<<5)) == 0){
+    ret=0;  //cpu does not have avx2 flag
   }
-  if((ebx & (1u<<31)) != 0){
+  if((ebx & (1<<31)) != 0){ //AVX512VL flag is set
     xgetbv(0, &eax, &edx);
     if((eax & 0xe0) == 0xe0)
-      ret=1;  //OS supports AVX512VL
+      ret=1;  //OS supports saving zmm register
   }
   return ret;
 #else
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index be22b247c..4f1b12f27 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -139,19 +139,30 @@ static gotoblas_t *force_coretype(char *coretype) {
 static gotoblas_t *get_coretype(void) {
 
   int implementer, variant, part, arch, revision, midr_el1;
+  char coremsg[128];
+
+#if (!defined OS_LINUX && !defined OS_ANDROID)
+  return NULL;
+#else
 
-#if (defined OS_LINUX || defined OS_ANDROID)
   if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
-    char coremsg[128];
+#ifdef __linux
+    FILE *infile;
+    char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
+    p = (char *) NULL ;
+    infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
+    if (!infile) return NULL;
+    fgets(buffer, sizeof(buffer), infile);
+    midr_el1=strtoul(buffer,NULL,16);
+    fclose(infile);
+#else
     snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
     openblas_warning(1, coremsg);
     return NULL;
-  }
-#else
-  return NULL;
 #endif
-
-  get_cpu_ftr(MIDR_EL1, midr_el1);
+  } else {
+    get_cpu_ftr(MIDR_EL1, midr_el1);
+  }
 
   /*
    * MIDR_EL1
   *
@@ -219,8 +230,12 @@ static gotoblas_t *get_coretype(void) {
         return &gotoblas_FALKOR;
     }
     break;
+  default:
+    snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
+    openblas_warning(1, coremsg);
   }
   return NULL;
+#endif
 }
 
 void gotoblas_dynamic_init(void) {
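The MIDR_EL1 value read above (either from the sysfs file or via get_cpu_ftr) is decoded into the implementer and part fields that drive the core-type switch. For reference, a small standalone sketch of that decoding, following the field layout in the Arm Architecture Reference Manual; the example values in the comment are assumptions for illustration:

/* Illustrative only: MIDR_EL1 layout per the Arm ARM.
   implementer = bits [31:24], variant = [23:20], architecture = [19:16],
   part number = [15:4], revision = [3:0]. */
#include <stdint.h>

static inline unsigned midr_implementer(uint32_t midr) { return (midr >> 24) & 0xff;  }
static inline unsigned midr_part(uint32_t midr)        { return (midr >> 4)  & 0xfff; }

/* e.g. implementer 0x41 ('A') is Arm Ltd., and part 0xd08 is a Cortex-A72. */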
diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c
new file mode 100644
index 000000000..9fd19d739
--- /dev/null
+++ b/driver/others/dynamic_mips64.c
@@ -0,0 +1,230 @@
+/*****************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include "common.h"
+
+extern gotoblas_t gotoblas_LOONGSON3R3;
+extern gotoblas_t gotoblas_LOONGSON3R4;
+
+extern void openblas_warning(int verbose, const char * msg);
+
+#define NUM_CORETYPES 2
+
+static char *corename[] = {
+  "loongson3r3",
+  "loongson3r4",
+  "UNKNOWN"
+};
+
+char *gotoblas_corename(void) {
+  if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0];
+  if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1];
+  return corename[NUM_CORETYPES];
+}
+
+static gotoblas_t *force_coretype(char *coretype) {
+  int i;
+  int found = -1;
+  char message[128];
+
+  for ( i=0 ; i < NUM_CORETYPES; i++)
+  {
+    if (!strncasecmp(coretype, corename[i], 20))
+    {
+      found = i;
+      break;
+    }
+  }
+
+  switch (found)
+  {
+    case 0: return (&gotoblas_LOONGSON3R3);
+    case 1: return (&gotoblas_LOONGSON3R4);
+  }
+  snprintf(message, 128, "Core not found: %s\n", coretype);
+  openblas_warning(1, message);
+  return NULL;
+}
+
+#define MMI_MASK 0x00000010
+#define MSA_MASK 0x00000020
+
+int fd[2];
+int support_cpucfg;
+
+static void handler(int signum)
+{
+  close(fd[1]);
+  exit(1);
+}
+
+/* Brief : Function to check if cpucfg supported on loongson
+ * Return: 1   supported
+ *         0   not supported
+ */
+static int cpucfg_test(void) {
+  pid_t pid;
+  int status = 0;
+
+  support_cpucfg = 0;
+  pipe(fd);
+  pid = fork();
+  if (pid == 0) { /* Subprocess */
+    struct sigaction act;
+    close(fd[0]);
+    /* Set signal action for SIGILL. */
+    act.sa_handler = handler;
+    sigaction(SIGILL,&act,NULL);
+
+    /* Execute cpucfg in subprocess.
*/ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..a2f56d839 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -52,6 +52,11 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 6) + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; +#endif return NULL; } diff --git a/driver/others/memory.c b/driver/others/memory.c index ba2bb55b9..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1767,11 +1767,11 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..36da13369 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -717,7 +717,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +731,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } -#endif -#endif - } #endif diff --git a/exports/gensymbol b/exports/gensymbol index 22e470da5..857a17a9e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/f_check b/f_check index f894aa9ac..42241ae10 100644 --- a/f_check +++ b/f_check @@ -33,7 +33,7 @@ if ($compiler eq "") { "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", "pgf95", "pgf90", "pgf77", - "flang", + "flang", "egfortran", "ifort"); OUTER: @@ -69,7 +69,12 @@ if ($compiler eq "") { $bu = "_"; } - if ($data =~ /GNU/ || $data =~ /GCC/ ) { + if ($data =~ /Fujitsu/) { + + $vendor = FUJITSU; + $openmp = "-Kopenmp"; + + } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; @@ -325,6 +330,9 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . 
" " ; } + if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + $flags = "-lomp"; + } if ( ($flags =~ /^\-l/) @@ -337,8 +345,8 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) - && ($flags !~ /[0-9]+/) + && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/)) + && ($flags !~ /[0-9]+/ || ($vendor == FUJITSU && $flags =~ /^-lfj90/)) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . " "; diff --git a/getarch.c b/getarch.c index 3f1448305..29671736e 100644 --- a/getarch.c +++ b/getarch.c @@ -97,9 +97,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__x86_64__) || defined(_M_X64) #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) #else +#ifndef NO_AVX512 #define NO_AVX512 #endif #endif +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -138,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -324,16 +326,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif +#endif #ifdef FORCE_SKYLAKEX #ifdef NO_AVX512 @@ -346,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -359,7 +372,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif @@ -376,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -389,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" #define LIBNAME "cooperlake" #define CORENAME "COOPERLAKE" #endif @@ -549,6 +562,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ @@ -559,10 +582,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" + "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "zen" #define CORENAME "ZEN" #endif +#endif #ifdef FORCE_SSE_GENERIC @@ -790,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" #else #endif @@ -981,6 +1005,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1266,6 +1304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1320,6 +1373,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED @@ -1405,8 +1462,41 @@ int main(int argc, char *argv[]){ printf("NUM_CORES=%d\n", get_num_cores()); -#if defined(__arm__) && !defined(FORCE) +#if defined(__arm__) +#if !defined(FORCE) + fprintf(stderr,"get features!\n"); get_features(); +#else + fprintf(stderr,"split archconfig!\n"); + sprintf(buffer, "%s", ARCHCONFIG); + + p = &buffer[0]; + + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + if (*p != 'H') { + while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; } + if (*p == '-') continue; + } + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf("="); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + if ((*p == ' ') || (*p =='\0')) printf("=1\n"); + } + } + } else p ++; + } +#endif #endif diff --git a/interface/Makefile b/interface/Makefile index 6b247b49f..597956fdb 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -19,7 +19,7 @@ ifeq ($(ARCH), MIPS) SUPPORT_GEMM3M = 1 endif -ifndef NO_FBLAS +ifneq ($(NO_FBLAS), 1) SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ @@ -48,6 +48,7 @@ SBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS2OBJS = sbgemv.$(SUFFIX) SBBLAS3OBJS = sbgemm.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -145,7 +146,7 @@ ZBLAS3OBJS += zgemm3m.$(SUFFIX) endif -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ @@ -284,6 +285,7 @@ CSBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -382,6 +384,7 @@ SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS2OBJS += $(CSBBLAS2OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) @@ -399,7 +402,7 @@ CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) +SBBLASOBJS = 
$(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -507,12 +510,12 @@ ifneq ($(BUILD_COMPLEX16),1) endif FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -$(info FUNCOBJS = {[$(FUNCOBJS)]} ) -ifdef EXPRECISION + +ifeq ($(EXPRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif -ifdef QUAD_PRECISION +ifeq ($(QUAD_PRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif @@ -538,7 +541,7 @@ clean :: level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) +level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) @@ -929,6 +932,11 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -c $< -o $(@F) +endif + ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< @@ -1656,6 +1664,11 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< diff --git a/interface/gemv.c b/interface/gemv.c index c9d52cd69..d5d739fb1 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -191,7 +191,6 @@ void CNAME(enum CBLAS_ORDER order, } #endif - //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; diff --git a/interface/sbgemv.c b/interface/sbgemv.c new file mode 100644 index 000000000..89debe82d --- /dev/null +++ b/interface/sbgemv.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "l1param.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#define ERROR_NAME "SBGEMV " + +#ifdef SMP +static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY) +{ + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + float alpha = *ALPHA; + float beta = *BETA; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') {i = 0;} + if (trans == 'T') {i = 1;} + if (trans == 'R') {i = 0;} + if (trans == 'C') {i = 1;} + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (i < 0) {info = 1;} + + trans = i; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy) +{ + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { // Column Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 0; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 1; + } + } else { // Row Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 1; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 0; + } + + t = n; + n = m; + m = t; + } + + info = -1; + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (trans < 0) {info = 1;} + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || 
(n==0)) return; + + if (trans) { + lenx = m; + leny = n; + } else { + lenx = n; + leny = m; + } + + if (alpha == ZERO) { + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); + return; + } + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) {x -= (lenx - 1) * incx;} + if (incy < 0) {y -= (leny - 1) * incy;} + +#ifdef SMP + int thread_thres_row = 20480; + if (trans) { + if (n <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } else { + if (m <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } + + + if (nthreads == 1) { +#endif + (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy); +#ifdef SMP + } else { + (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads); + } +#endif + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + IDEBUG_END; + + return; +} diff --git a/kernel/Makefile b/kernel/Makefile index e52781c6d..4e86546b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,18 +5,6 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system -ifdef HAVE_SSE3 -CFLAGS += -msse3 -endif -ifdef HAVE_SSSE3 -CFLAGS += -mssse3 -endif - -ifeq ($(C_COMPILER), GCC) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -26,20 +14,14 @@ endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif endif ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) - GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) - GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) - GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) - GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) + GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif @@ -49,12 +31,6 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 -endif - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) - override CFLAGS += -msse -msse2 -endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) @@ -82,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override 
CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -92,6 +70,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 79399c342..888a9b959 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -48,6 +48,16 @@ ifndef XGEMVTKERNEL XGEMVTKERNEL = zgemv_t.S endif +ifeq ($(BUILD_BFLOAT16),1) +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = ../x86_64/sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = ../x86_64/sbgemv_t.c +endif +endif + ### GER ### ifndef SGERKERNEL @@ -234,6 +244,12 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_t$(TSUFFIX).$(SUFFIX) +endif + ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -483,4 +499,10 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2ba593c2e..d8d739965 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -25,7 +25,7 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) +ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index 63584b95c..a486a1868 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -42,24 +42,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) n *= inc_x; if (inc_x == 1) { -#if V_SIMD +#if V_SIMD && (!defined(DOUBLE) || (defined(DOUBLE) && V_SIMD_F64 && V_SIMD > 128)) #ifdef DOUBLE const int vstep = v_nlanes_f64; - const int unrollx2 = n & (-vstep * 2); + const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; v_f64 vsum0 = v_zero_f64(); v_f64 vsum1 = v_zero_f64(); - while (i < unrollx2) - { - vsum0 = v_add_f64(vsum0, v_loadu_f64(x)); - vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep)); - i += vstep * 2; - } - vsum0 = v_add_f64(vsum0, vsum1); - while (i < unrollx) + v_f64 vsum2 = v_zero_f64(); + v_f64 vsum3 = v_zero_f64(); + for (; i < unrollx4; i += vstep * 4) + { + vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); + vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep)); + vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2)); + vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3)); + } + vsum0 = v_add_f64( + v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3)); + for (; i < unrollx; i += vstep) { vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); - i += vstep; } sumf = v_sum_f64(vsum0); #else @@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_f32 vsum1 = 
v_zero_f32(); v_f32 vsum2 = v_zero_f32(); v_f32 vsum3 = v_zero_f32(); - while (i < unrollx4) + for (; i < unrollx4; i += vstep * 4) { - vsum0 = v_add_f32(vsum0, v_loadu_f32(x)); - vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep)); - vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2)); - vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3)); - i += vstep * 4; + vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); + vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep)); + vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2)); + vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3)); } vsum0 = v_add_f32( v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); - while (i < unrollx) + for (; i < unrollx; i += vstep) { vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); - i += vstep; } sumf = v_sum_f32(vsum0); #endif diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index ba0e57eb5..9249b54f8 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__POWER__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else diff --git a/kernel/generic/trmmkernel_16x4.c b/kernel/generic/trmmkernel_16x4.c new file mode 100644 index 000000000..7ea4e108c --- /dev/null +++ b/kernel/generic/trmmkernel_16x4.c @@ -0,0 +1,2092 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) 
__msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dswap_msa.c b/kernel/mips/dswap_msa.c index 7b1f02477..67e97f710 100644 --- a/kernel/mips/dswap_msa.c +++ b/kernel/mips/dswap_msa.c @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) 
__msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 
= LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = 
(v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ 
-490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) 
ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/sswap_msa.c b/kernel/mips/sswap_msa.c index 46fa8aa87..d412285b0 100644 --- a/kernel/mips/sswap_msa.c +++ b/kernel/mips/sswap_msa.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c index 669c25758..97a80b4ba 100644 --- a/kernel/mips/zgemv_n_msa.c +++ b/kernel/mips/zgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index e6febb577..6492f90be 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
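/*
 * A scalar model of the sign bookkeeping done by the OP0..OP2 macros
 * below: the transposed complex GEMV accumulates products in which
 * either factor may be conjugated (CONJ for one operand, XCONJ for
 * the other), and each of the four combinations flips a different
 * subset of signs.  Sketch only -- the exact macro-to-sign wiring is
 * in the hunk that follows.
 */
#include <complex.h>

static double complex cmla_model(double complex acc,
                                 double complex a, double complex x,
                                 int conj_a,  /* CONJ defined  */
                                 int conj_x)  /* XCONJ defined */
{
    if (conj_a) a = conj(a);
    if (conj_x) x = conj(x);
    return acc + a * x;
}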
#undef OP3 #undef OP4 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define ZGEMV_T_8x1() \ diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = 
../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 86df7e3a2..d61f5194a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power10.c -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = dgemm_ncopy_4_power8.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = dgemm_ncopy_8_power10.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -141,13 +141,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = 
casum.c ZASUMKERNEL = zasum.c # -SAXPYKERNEL = saxpy.c +SAXPYKERNEL = saxpy_power10.c DAXPYKERNEL = daxpy_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power9.S -else -CAXPYKERNEL = caxpy.c -endif +CAXPYKERNEL = caxpy_power10.c ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy_power10.c @@ -155,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy_power10.c ZCOPYKERNEL = zcopy_power10.c # -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -DSDOTKERNEL = sdot.c +SDOTKERNEL = sdot_power10.c +DDOTKERNEL = ddot_power10.c +DSDOTKERNEL = sdot_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CDOTKERNEL = cdot_power9.S else diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..2bd2516de 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c new file mode 100644 index 000000000..56a5ab47a --- /dev/null +++ b/kernel/power/caxpy_microk_power10.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
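/*
 * Scalar picture of what caxpy_kernel_8 below computes: y += alpha*x
 * over interleaved (re,im) float pairs.  The kernel pre-multiplies the
 * splatted alpha_i by the mvec sign vector and permutes x so the real
 * and imaginary halves swap places (the xxperm), which reduces the
 * complex multiply to two fused multiply-adds per element.  Sketch for
 * the non-CONJ convention; the helper name is illustrative.
 */
static void caxpy_scalar_model(long n, const float *x, float *y,
                               float alpha_r, float alpha_i)
{
    const float sign[2] = { -1.0f, 1.0f };   /* mvec analogue */
    long i;
    for (i = 0; i < 2 * n; i += 2) {
        float sw0 = x[i + 1], sw1 = x[i];    /* swapped halves */
        y[i]     += alpha_r * x[i]     + (alpha_i * sign[0]) * sw0;
        y[i + 1] += alpha_r * x[i + 1] + (alpha_i * sign[1]) * sw1;
    }
}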
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8 (long n, float *x, float *y, + float alpha_r, float alpha_i) +{ +#if !defined(CONJ) + static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + const float *mvecp = mvec; + /* We have to load reverse mask for big endian. */ + /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ + + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + long ytmp; + + __asm__ + ( + "xscvdpspn 32, %7 \n\t" + "xscvdpspn 33, %8 \n\t" + "xxspltw 32, 32, 0 \n\t" + "xxspltw 33, 33, 0 \n\t" + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmulsp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmulsp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + "mr %4, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + + "lxvp 48, 0(%3) \n\t" // y0 + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=b" (ytmp) // 4 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 7 + "d" (alpha_i), // 8 + "4" (mvecp), // 9 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59" + ); +} diff --git a/kernel/power/caxpy_power10.c b/kernel/power/caxpy_power10.c new file mode 100644 index 000000000..14b8cda67 --- /dev/null +++ b/kernel/power/caxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
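/*
 * Blocking idiom used by the C wrapper below: n & -16 clears the four
 * low bits of n, i.e. yields the largest multiple of 16 not exceeding
 * n.  The vector kernel consumes that body, and a plain scalar loop
 * finishes the at-most-15 leftover elements.
 */
static long vector_body_len(long n)
{
    return n & -16;   /* 37 -> 32, 16 -> 16, 15 -> 0 */
}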
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "caxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + caxpy_kernel_8 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); + i += n1; - i = n1; while(i < n) { diff --git a/kernel/power/ddot_microk_power10.c b/kernel/power/ddot_microk_power10.c new file mode 100644 index 000000000..3a9865cc0 --- /dev/null +++ b/kernel/power/ddot_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static double ddot_kernel_8 (long n, double *x, double *y) +{ + double dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + + "xsadddp %x0, 32, 33 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=d" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/ddot_power10.c b/kernel/power/ddot_power10.c new file mode 100644 index 000000000..302dceb68 --- /dev/null +++ b/kernel/power/ddot_power10.c @@ -0,0 +1,130 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "ddot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + dot = ddot_kernel_8(n1, x, y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b2a29140e..b531799a6 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif ) { - BLASLONG N = n; BLASLONG i1; #if defined(TRMMKERNEL) BLASLONG off; @@ -158,10 +157,221 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, off = -offset; #endif v4sf_t valpha = { alpha, alpha }; - N = n >> 2; - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j, temp; + BLASLONG j, temp; + 
FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + for (j = 0; j < (m >> 3); j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rb = (vec_t *) & BO[0]; + __vector_pair rowB, rowB1; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 3]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + SAVE_ACC (&acc4, 4); + SAVE_ACC1 (&acc5, 4); + SAVE_ACC (&acc6, 6); + SAVE_ACC1 (&acc7, 6); + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + if (m & 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 2]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + if (m & 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + 
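/* rb[0..3] are the four 2-double chunks of this k-step of B (eight
 * values for the eight columns handled in this path).  Each
 * __builtin_mma_assemble_pair glues two chunks into a __vector_pair,
 * the 4-double operand the fp64 MMA instructions take; xvf64ger then
 * forms a 4x2 outer product per accumulator (a __vector_quad holds
 * eight doubles).  The reversed argument order (rb[1], rb[0]) appears
 * to account for register ordering within the pair on little-endian
 * POWER10. */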
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 1]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + if (m & 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; + v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; + v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] }; + v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA * rowB1; + t2 += rowA * rowB2; + t3 += rowA * rowB3; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; + CO[4 * ldc] = t2[0]; + CO[5 * ldc] = t2[1]; + CO[6 * ldc] = t3[0]; + CO[7 * ldc] = t3[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; + CO[4 * ldc] += t2[0]; + CO[5 * ldc] += t2[1]; + CO[6 * ldc] += t3[0]; + CO[7 * ldc] += t3[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + B += k << 3; + } + if (n & 4) + { + BLASLONG j, temp; FLOAT *CO; FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) @@ -172,71 +382,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; - for (j = 0; j < i; j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 4); -#else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - BLASLONG l = 0; - PREFETCH1 (CO, 0); - PREFETCH1 (CO + ldc, 0); - PREFETCH1 (CO + ldc + ldc, 0); - PREFETCH1 (CO + ldc + ldc + ldc, 0); - PREFETCH1 (CO, 128); - PREFETCH1 (CO + ldc, 128); - PREFETCH1 (CO + ldc + ldc, 128); - PREFETCH1 (CO + ldc + ldc + ldc, 128); - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 4]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - 
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); - } - SAVE_ACC (&acc0, 0); - SAVE_ACC (&acc2, 4); - SAVE_ACC (&acc1, 2); - SAVE_ACC (&acc3, 6); - SAVE_ACC (&acc4, 8); - SAVE_ACC (&acc6, 12); - SAVE_ACC (&acc5, 10); - SAVE_ACC (&acc7, 14); - AO += temp << 4; - BO += temp << 2; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 4) -#endif - CO += 16; - } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { - BLASLONG i, j, temp; + BLASLONG j, temp; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc << 1; AO = A; - i = m >> 4; - for (j = 0; j < i; j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 2); -#else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[0]; - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); - for (l = 1; l < temp; l++) - { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - rowA = (vec_t *) & AO[l << 4]; - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); - } - SAVE2x4_ACC (&acc0, 0); - SAVE2x4_ACC (&acc1, 2); - SAVE2x4_ACC (&acc2, 4); - SAVE2x4_ACC (&acc3, 6); - SAVE2x4_ACC (&acc4, 8); - SAVE2x4_ACC (&acc5, 10); - SAVE2x4_ACC (&acc6, 12); - SAVE2x4_ACC (&acc7, 14); - CO += 16; - AO += temp << 4; - BO += temp << 1; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE 
(16, 2) -#endif - } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 1; } - N = (n & 1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 1) -#else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - v4sf_t t1 = { 0, 0 }; - v4sf_t t2 = { 0, 0 }; - v4sf_t t3 = { 0, 0 }; - v4sf_t t4 = { 0, 0 }; - v4sf_t t5 = { 0, 0 }; - v4sf_t t6 = { 0, 0 }; - v4sf_t t7 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowB = { BO[l], BO[l] }; - v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; - v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; - v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; - v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; - v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; - v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; - v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; - v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; - t += rowA * rowB; - t1 += rowA1 * rowB; - t2 += rowA2 * rowB; - t3 += rowA3 * rowB; - t4 += rowA4 * rowB; - t5 += rowA5 * rowB; - t6 += rowA6 * rowB; - t7 += rowA7 * rowB; - } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; - t4 = t4 * valpha; - t5 = t5 * valpha; - t6 = t6 * valpha; - t7 = t7 * valpha; -#if defined(TRMMKERNEL) - CO[0] = t[0]; - CO[1] = t[1]; - CO[2] = t1[0]; - CO[3] = t1[1]; - CO[4] = t2[0]; - CO[5] = t2[1]; - CO[6] = t3[0]; - CO[7] = t3[1]; - CO[8] = t4[0]; - CO[9] = t4[1]; - CO[10] = t5[0]; - CO[11] = t5[1]; - CO[12] = t6[0]; - CO[13] = t6[1]; - CO[14] = t7[0]; - CO[15] = t7[1]; -#else - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t1[0]; - CO[3] += t1[1]; - CO[4] += t2[0]; - CO[5] += t2[1]; - CO[6] += t3[0]; - CO[7] += t3[1]; - CO[8] += t4[0]; - CO[9] += t4[1]; - CO[10] += t5[0]; - CO[11] += t5[1]; - CO[12] += t6[0]; - CO[13] += t6[1]; - CO[14] += t7[0]; - CO[15] += t7[1]; -#endif - AO += temp << 4; - BO += temp; - CO += 16; - i -= 16; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 1) -#endif - } - while (i >= 8) + for (i = 0; i < (m >> 3); i++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT 
alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/kernel/power/dgemm_ncopy_8_power10.c b/kernel/power/dgemm_ncopy_8_power10.c new file mode 100644 index 000000000..9836c2e7f --- /dev/null +++ b/kernel/power/dgemm_ncopy_8_power10.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+ BLASLONG i, j;
+
+ IFLOAT *aoffset;
+ IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+ IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+ IFLOAT *boffset;
+ IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ IFLOAT ctemp09, ctemp17, ctemp33;
+ IFLOAT ctemp25, ctemp41;
+ IFLOAT ctemp49, ctemp57;
+
+ aoffset = a;
+ boffset = b;
+
+ j = (n >> 3);
+ if (j > 0){
+ do{
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset3 = aoffset2 + lda;
+ aoffset4 = aoffset3 + lda;
+ aoffset5 = aoffset4 + lda;
+ aoffset6 = aoffset5 + lda;
+ aoffset7 = aoffset6 + lda;
+ aoffset8 = aoffset7 + lda;
+ aoffset += 8 * lda;
+
+ i = (m >> 3);
+ if (i > 0){
+ do{
+ PREFETCHA (aoffset1, 384);
+ PREFETCHA (aoffset2, 384);
+ PREFETCHA (aoffset3, 384);
+ PREFETCHA (aoffset4, 384);
+ PREFETCHA (aoffset5, 384);
+ PREFETCHA (aoffset6, 384);
+ PREFETCHA (aoffset7, 384);
+ PREFETCHA (aoffset8, 384);
+ __vector double va0 = *(__vector double*)(aoffset1 + 0);
+ __vector double va1 = *(__vector double*)(aoffset1 + 2);
+ __vector double va2 = *(__vector double*)(aoffset1 + 4);
+ __vector double va3 = *(__vector double*)(aoffset1 + 6);
+
+ __vector double va4 = *(__vector double*)(aoffset2 + 0);
+ __vector double va5 = *(__vector double*)(aoffset2 + 2);
+ __vector double va6 = *(__vector double*)(aoffset2 + 4);
+ __vector double va7 = *(__vector double*)(aoffset2 + 6);
+
+ __vector double va8 = *(__vector double*)(aoffset3 + 0);
+ __vector double va9 = *(__vector double*)(aoffset3 + 2);
+ __vector double va10 = *(__vector double*)(aoffset3 + 4);
+ __vector double va11 = *(__vector double*)(aoffset3 + 6);
+
+ __vector double va12 = *(__vector double*)(aoffset4 + 0);
+ __vector double va13 = *(__vector double*)(aoffset4 + 2);
+ __vector double va14 = *(__vector double*)(aoffset4 + 4);
+ __vector double va15 = *(__vector double*)(aoffset4 + 6);
+
+ __vector double va16 = *(__vector double*)(aoffset5 + 0);
+ __vector double va17 = *(__vector double*)(aoffset5 + 2);
+ __vector double va18 = *(__vector double*)(aoffset5 + 4);
+ __vector double va19 = *(__vector double*)(aoffset5 + 6);
+
+ __vector double va20 = *(__vector double*)(aoffset6 + 0);
+ __vector double va21 = *(__vector double*)(aoffset6 + 2);
+ __vector double va22 = *(__vector double*)(aoffset6 + 4);
+ __vector double va23 = *(__vector double*)(aoffset6 + 6);
+
+ __vector double va24 = *(__vector double*)(aoffset7 + 0);
+ __vector double va25 = *(__vector double*)(aoffset7 + 2);
+ __vector double va26 = *(__vector double*)(aoffset7 + 4);
+ __vector double va27 = *(__vector double*)(aoffset7 + 6);
+
+ __vector double va28 = *(__vector double*)(aoffset8 + 0);
+ __vector double va29 = *(__vector double*)(aoffset8 + 2);
+ __vector double va30 = *(__vector double*)(aoffset8 + 4);
+ __vector double va31 = *(__vector double*)(aoffset8 + 6);
+
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va4, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0);
+ *(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0);
+ *(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0);
+ *(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3);
+ *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
+ *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
+ 
*(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3); + + *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0); + *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0); + *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0); + *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0); + *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3); + *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3); + *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3); + *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3); + + *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0); + *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0); + *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0); + *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0); + *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3); + *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3); + *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3); + *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3); + + *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0); + *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0); + *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0); + *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0); + *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3); + *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3); + *(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3); + *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3); + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + PREFETCHA (aoffset1, 384); + PREFETCHA (aoffset2, 384); + PREFETCHA (aoffset3, 384); + PREFETCHA (aoffset4, 384); + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset1 + 2); + __vector double va2 = *(__vector double*)(aoffset2 + 0); + __vector double va3 = *(__vector double*)(aoffset2 + 2); + __vector double va4 = *(__vector double*)(aoffset3 + 0); + __vector double va5 = *(__vector double*)(aoffset3 + 2); + __vector double va6 = *(__vector double*)(aoffset4 + 0); + __vector double va7 = *(__vector double*)(aoffset4 + 2); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0); + *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3); + *(__vector 
double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3); + *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0); + *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0); + *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3); + *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset2 + 0); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3); + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index 4be8a5f9b..e47de2cb5 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -/************************************************************************************** -* 2016/03/30 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) @@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } +static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + + double *a0; + double *a1; + double *a2; + double *a3; + double *a4; + double *a5; + double *a6; + double *a7; + long tmp; + __asm__ + ( + "lxvp 34, 0( %15) \n\t" // x0, x1 + "lxvp 38, 32( %15) \n\t" // x4, x5 + + XXSPLTD_S(58,%x14,0) // alpha, alpha + "sldi %10, %17, 3 \n\t" // lda * sizeof (double) + "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha + "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha + "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha + + "li %11, 32 \n\t" + + "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda + "add %10, %10, %10 \n\t" // 2 * lda + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha + + "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda + "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda + "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda + "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda + "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda + + "lxvp 40, 0( %3) \n\t" // a0[0], a0[1] + "lxvp 42, 0( %4) \n\t" // a1[0], a1[1] + "lxvp 44, 0( %5) \n\t" // a2[0], a2[1] + "lxvp 46, 0( %6) \n\t" // a3[0], a3[1] + "lxvp 50, 0( %7) \n\t" // a4[0] + "lxvp 52, 0( %8) \n\t" // a5[0] + "lxvp 54, 0( %9) \n\t" // a6[0] + "lxvp 56, 0( %10) \n\t" // a7[0] + + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "lxvpx 50, %7, %11 \n\t" // a4[0] + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "lxvpx 52, %8, %11 \n\t" // a5[0] + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "lxvpx 54, %9, %11 \n\t" // a6[0] + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "lxvpx 56, %10, %11 \n\t" // a7[0] + "addi %11, %11, 32 \n\t" + + "stxvp 36, 0( %2) \n\t" // y0, y1 + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t"
+ "bgt one%= \n"
+
+ "two%=: \n\t"
+
+ "lxvp 36, 0( %2) \n\t" // y0, y1
+ "xvmaddadp 36, 40, 34 \n\t"
+ "xvmaddadp 37, 41, 34 \n\t"
+ "xvmaddadp 36, 42, 35 \n\t"
+ "xvmaddadp 37, 43, 35 \n\t"
+ "xvmaddadp 36, 44, 32 \n\t"
+ "xvmaddadp 37, 45, 32 \n\t"
+ "xvmaddadp 36, 46, 33 \n\t"
+ "xvmaddadp 37, 47, 33 \n\t"
+ "xvmaddadp 36, 50, 48 \n\t"
+ "xvmaddadp 37, 51, 48 \n\t"
+ "xvmaddadp 36, 52, 49 \n\t"
+ "xvmaddadp 37, 53, 49 \n\t"
+ "xvmaddadp 36, 54, 38 \n\t"
+ "xvmaddadp 37, 55, 38 \n\t"
+ "xvmaddadp 36, 56, 39 \n\t"
+ "xvmaddadp 37, 57, 39 \n\t"
+ "stxvp 36, 0( %2) \n\t" // y0, y1
+
+ :
+ "+m" (*y),
+ "+r" (n), // 1
+ "+b" (y), // 2
+ "=b" (a0), // 3
+ "=b" (a1), // 4
+ "=&b" (a2), // 5
+ "=&b" (a3), // 6
+ "=&b" (a4), // 7
+ "=&b" (a5), // 8
+ "=&b" (a6), // 9
+ "=&b" (a7), // 10
+ "=b" (tmp)
+ :
+ "m" (*x),
+ "m" (*ap),
+ "d" (alpha), // 14
+ "r" (x), // 15
+ "3" (ap), // 16
+ "4" (lda) // 17
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
+ "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
+ );
+}
diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c
index ad5f1ba0d..aba15ab4e 100644
--- a/kernel/power/dgemv_n_power10.c
+++ b/kernel/power/dgemv_n_power10.c
@@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#include <altivec.h>
-
-typedef __vector unsigned char vec_t;
-typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
-typedef __vector_pair __attribute__((aligned(8))) vecp_t;
 #include "dgemv_n_microk_power10.c"
-#define MMA(X, APTR, ACC) \
- rX = (vec_t *) & X; \
- rowA = *((vecp_t*)((void*)&APTR)); \
- __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
-
-#define SAVE(ACC, Z) \
- rowC = (v4sf_t *) &y[Z]; \
- __builtin_mma_disassemble_acc ((void *)result, ACC); \
- result[0][1] = result[1][0]; \
- result[2][1] = result[3][0]; \
- rowC[0] += valpha * result[0]; \
- rowC[1] += valpha * result[2];
-
-void
-dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
- FLOAT * y, FLOAT alpha)
-{
- BLASLONG i, j, tmp;
- FLOAT *a0 = a_ptr;
- FLOAT *x1 = xo;
- vector double valpha = { alpha, alpha };
- v4sf_t *rowC;
- __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- v4sf_t result[4];
- vecp_t rowA;
- vec_t *rX;
- tmp = (n / 32) * 32;
- for (i = 0; i < tmp; i += 32)
- {
- xo = x1;
- a0 = a_ptr;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
- __builtin_mma_xxsetaccz (&acc2);
- __builtin_mma_xxsetaccz (&acc3);
- __builtin_mma_xxsetaccz (&acc4);
- __builtin_mma_xxsetaccz (&acc5);
- __builtin_mma_xxsetaccz (&acc6);
- __builtin_mma_xxsetaccz (&acc7);
- for (j = 0; j < 32; j++)
- {
- __builtin_prefetch (xo+j);
- __builtin_prefetch (a0+i+j+lda);
- MMA (xo[j], a0[i + 0 + j * lda], &acc0);
- MMA (xo[j], a0[i + 4 + j * lda], &acc1);
- MMA (xo[j], a0[i + 8 + j * lda], &acc2);
- MMA (xo[j], a0[i + 12 + j * lda], &acc3);
- 
MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i + 0); - SAVE (&acc1, i + 4); - SAVE (&acc2, i + 8); - SAVE (&acc3, i + 12); - SAVE (&acc4, i + 16); - SAVE (&acc5, i + 20); - SAVE (&acc6, i + 24); - SAVE (&acc7, i + 28); - - } - for (i = tmp; i < n; i += 4) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i); - } -} - - #define NBMAX 4096 #ifndef HAVE_KERNEL_4x4 @@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; - BLASLONG n1; BLASLONG m1; BLASLONG m2; BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - BLASLONG lda128 = lda << 7; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8] __attribute__ ((aligned (16))); FLOAT *ybuffer; @@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); ybuffer = buffer; - BLASLONG n128 = n >> 7; - n1 = (n - (n128 * 128)) >> 2; - n2 = (n - (n128 * 128)) & 3; + BLASLONG n8 = n >> 3; + n2 = n & 3; m3 = m & 3 ; m1 = m & -4 ; @@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); - a_ptr += lda128; - x_ptr += 128; + dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda8; + x_ptr += 8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); a_ptr += lda4; @@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - FLOAT xbuffer[128] __attribute__ ((aligned (16))); 
BLASLONG j; - for ( j = 0; j < 128 ; j++) + for ( j = 0; j < 8 ; j++) { xbuffer[j] = x_ptr[0]; x_ptr += inc_x; } - dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); - a_ptr += lda128; + dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c new file mode 100644 index 000000000..6ede1dcdd --- /dev/null +++ b/kernel/power/saxpy_microk_power10.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void saxpy_kernel_64(long n, float *x, float *y, float alpha) +{ + __vector float t0 = {alpha, alpha,alpha, alpha}; + + __asm__ + ( + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + "lxvp 48, 128(%2) \n\t" + "lxvp 50, 160(%2) \n\t" + "lxvp 52, 192(%2) \n\t" + "lxvp 54, 224(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + "lxvp 36, 256(%3) \n\t" + "lxvp 38, 288(%3) \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "lxvp 44, 320(%3) \n\t" + "lxvp 46, 352(%3) \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + + "lxvp 48, 128(%2) \n\t" + "stxvp 56, 128(%3) \n\t" + + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "lxvp 50, 160(%2) \n\t" + "stxvp 58, 160(%3) \n\t" + + "lxvp 56, 384(%3) \n\t" + "lxvp 58, 416(%3) \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + + "lxvp 52, 192(%2) \n\t" + "stxvp 60, 192(%3) \n\t" + + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + + "lxvp 54, 224(%2) \n\t" + "stxvp 62, 224(%3) \n\t" + + "lxvp 60, 448(%3) \n\t" + "lxvp 62, 480(%3) \n\t" + + "addi %2, %2, 256 \n\t" + "addi %3, %3, 256 \n\t" + + "addic. %1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + "stxvp 56, 128(%3) \n\t" + "stxvp 58, 160(%3) \n\t" + "stxvp 60, 192(%3) \n\t" + "stxvp 62, 224(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "wa" (t0), // 4 + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); + +} + + diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c new file mode 100644 index 000000000..4a13c1f88 --- /dev/null +++ b/kernel/power/saxpy_power10.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "saxpy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_8 +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 ) + saxpy_kernel_64(n1, &x[i], &y[i], da); + + i += n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/sdot_microk_power10.c b/kernel/power/sdot_microk_power10.c new file mode 100644 index 000000000..2f028c5a0 --- /dev/null +++ b/kernel/power/sdot_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static float sdot_kernel_16 (long n, float *x, float *y) +{ + float dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=f" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/sdot_power10.c b/kernel/power/sdot_power10.c new file mode 100644 index 000000000..b61f0a90d --- /dev/null +++ b/kernel/power/sdot_power10.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "sdot_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+ BLASLONG register i = 0;
+ FLOAT dot = 0.0;
+
+ while(i < n)
+ {
+ dot += y[i] * x[i]
+ + y[i+1] * x[i+1]
+ + y[i+2] * x[i+2]
+ + y[i+3] * x[i+3]
+ + y[i+4] * x[i+4]
+ + y[i+5] * x[i+5]
+ + y[i+6] * x[i+6]
+ + y[i+7] * x[i+7] ;
+
+ i+=8 ;
+
+ }
+ return dot;
+}
+
+#endif
+
+#if defined (DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ double dot = 0.0 ;
+
+#if defined (DSDOT)
+ double mydot = 0.0;
+ FLOAT asmdot = 0.0;
+#else
+ FLOAT mydot=0.0;
+#endif
+ BLASLONG n1;
+
+ if ( n <= 0 ) return(dot);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ n1 = n & (BLASLONG)(-32);
+
+ if ( n1 )
+#if defined(DSDOT)
+ {
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG n2 = 32;
+ while (i < n1)
+ {
+ asmdot = sdot_kernel_16(n2, x1, y1);
+ mydot += (double)asmdot;
+ x1 += n2;
+ y1 += n2;
+ i += n2;
+ }
+ }
+#else
+ mydot = sdot_kernel_16(n1, x, y);
+#endif
+
+ i = n1;
+ while(i < n)
+ {
+#if defined(DSDOT)
+ dot += (double)y[i] * (double)x[i] ;
+#else
+ dot += y[i] * x[i] ;
+#endif
+ i++ ;
+
+ }
+ dot += mydot;
+ return(dot);
+
+ }
+
+ n1 = n & (BLASLONG)(-2);
+
+ while(i < n1)
+ {
+#if defined(DSDOT)
+ dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
+#else
+ dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x] ;
+#endif
+ ix += inc_x*2 ;
+ iy += inc_y*2 ;
+ i += 2 ;
+
+ }
+
+ while(i < n)
+ {
+#if defined(DSDOT)
+ dot += (double)y[iy] * (double)x[ix] ;
+#else
+ dot += y[iy] * x[ix] ;
+#endif
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(dot);
+
+}
diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c
--- a/kernel/power/sgemm_kernel_power10.c
+++ b/kernel/power/sgemm_kernel_power10.c
@@ ... @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
- N = n >> 3;
- for (i1 = 0; i1 < N; i1++)
+ for (i1 = 0; i1 < (n >> 3); i1++)
 {
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
 FLOAT *CO;
 FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
@@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 AO = A;
 PREFETCH1 (A, 128);
 PREFETCH1 (A, 256);
- i = m >> 4;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 4); j++)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
 CO += 16;
 }
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 REFRESH_AFTER_SAVE (8, 8)
#endif
 }
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 REFRESH_AFTER_SAVE (4, 8)
#endif
 }
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
 {
 FLOAT *BO;
#if defined(TRMMKERNEL)
@@ -550,8 
+544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 8) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 3; } - N = (n & 7) >> 2; - for (i1 = 0; i1 < N; i1++) + if (n & 4) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 4) #endif } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 2) #endif } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; v4sf_t *rowC; @@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; v4sf_t *rowC; @@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; BLASLONG l = 0; @@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 1; } - N = (n & 1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) + for (i = 0; i < (m >> 4); i++) { FLOAT *BO; BLASLONG l = 0; @@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 4; BO += temp; CO += 16; - i -= 16; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (16, 1) #endif } - while (i >= 8) + if (m & 8) { FLOAT *BO; BLASLONG l = 0; @@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; BLASLONG l = 0; @@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, 
BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..5ca1603a6 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+ FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+ c0 = &c[0*ldc];
+ c1 = &c[1*ldc];
+ c2 = &c[2*ldc];
+ c3 = &c[3*ldc];
+ c4 = &c[4*ldc];
+ c5 = &c[5*ldc];
+ c6 = &c[6*ldc];
+ c7 = &c[7*ldc];
+ vector FLOAT *Va = (vector FLOAT *) a;
+ vector FLOAT *Vb = (vector FLOAT *) b;
+ vector FLOAT *Vc0 = (vector FLOAT *) c0;
+ vector FLOAT *Vc1 = (vector FLOAT *) c1;
+ vector FLOAT *Vc2 = (vector FLOAT *) c2;
+ vector FLOAT *Vc3 = (vector FLOAT *) c3;
+ vector FLOAT *Vc4 = (vector FLOAT *) c4;
+ vector FLOAT *Vc5 = (vector FLOAT *) c5;
+ vector FLOAT *Vc6 = (vector FLOAT *) c6;
+ vector FLOAT *Vc7 = (vector FLOAT *) c7;
+ vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
+
+ b[56] = (c0[7] *= a[63]);
+ b[57] = (c1[7] *= a[63]);
+ b[58] = (c2[7] *= a[63]);
+ b[59] = (c3[7] *= a[63]);
+ b[60] = (c4[7] *= a[63]);
+ b[61] = (c5[7] *= a[63]);
+ b[62] = (c6[7] *= a[63]);
+ b[63] = (c7[7] *= a[63]);
+ VbS0 = vec_splat(Vb[28], 0);
+ VbS1 = vec_splat(Vb[28], 1);
+ VbS2 = vec_splat(Vb[29], 0);
+ VbS3 = vec_splat(Vb[29], 1);
+ VbS4 = vec_splat(Vb[30], 0);
+ VbS5 = vec_splat(Vb[30], 1);
+ VbS6 = vec_splat(Vb[31], 0);
+ VbS7 = vec_splat(Vb[31], 1);
+ Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]);
+ Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]);
+ Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]);
+ Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]);
+ Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]);
+ Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]);
+ Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]);
+ Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]);
+ Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]);
+ Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]);
+ Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]);
+ Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]);
+ Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]);
+ Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]);
+ Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]);
+ Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]);
+ Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]);
+ Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]);
+ Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]);
+ Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]);
+ Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]);
+ Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]);
+ Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]);
+ Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]);
+ c0[6] -= c0[7] * a[62];
+ c1[6] -= c1[7] * a[62];
+ c2[6] -= c2[7] * a[62];
+ c3[6] -= c3[7] * a[62];
+ c4[6] -= c4[7] * a[62];
+ c5[6] -= c5[7] * a[62];
+ 
c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, 
Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + 
vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], 
Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + 
Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + c2[ 8] -= b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = 
vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] 
*= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] -= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] 
* a[84];
+  c7[4] -= b[47] * a[84];
+
+  b[32] = (c0[4] *= a[68]);
+  b[33] = (c1[4] *= a[68]);
+  b[34] = (c2[4] *= a[68]);
+  b[35] = (c3[4] *= a[68]);
+  b[36] = (c4[4] *= a[68]);
+  b[37] = (c5[4] *= a[68]);
+  b[38] = (c6[4] *= a[68]);
+  b[39] = (c7[4] *= a[68]);
+  VbS0 = vec_splat(Vb[8], 0);
+  VbS1 = vec_splat(Vb[8], 1);
+  VbS2 = vec_splat(Vb[8], 2);
+  VbS3 = vec_splat(Vb[8], 3);
+  VbS4 = vec_splat(Vb[9], 0);
+  VbS5 = vec_splat(Vb[9], 1);
+  VbS6 = vec_splat(Vb[9], 2);
+  VbS7 = vec_splat(Vb[9], 3);
+  Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]);
+  Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]);
+  Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]);
+  Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]);
+  Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]);
+  Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]);
+  Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]);
+  Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]);
+
+  b[24] = (c0[3] *= a[51]);
+  b[25] = (c1[3] *= a[51]);
+  b[26] = (c2[3] *= a[51]);
+  b[27] = (c3[3] *= a[51]);
+  b[28] = (c4[3] *= a[51]);
+  b[29] = (c5[3] *= a[51]);
+  b[30] = (c6[3] *= a[51]);
+  b[31] = (c7[3] *= a[51]);
+  c0[0] -= b[24] * a[48];
+  c0[1] -= b[24] * a[49];
+  c0[2] -= b[24] * a[50];
+  c1[0] -= b[25] * a[48];
+  c1[1] -= b[25] * a[49];
+  c1[2] -= b[25] * a[50];
+  c2[0] -= b[26] * a[48];
+  c2[1] -= b[26] * a[49];
+  c2[2] -= b[26] * a[50];
+  c3[0] -= b[27] * a[48];
+  c3[1] -= b[27] * a[49];
+  c3[2] -= b[27] * a[50];
+  c4[0] -= b[28] * a[48];
+  c4[1] -= b[28] * a[49];
+  c4[2] -= b[28] * a[50];
+  c5[0] -= b[29] * a[48];
+  c5[1] -= b[29] * a[49];
+  c5[2] -= b[29] * a[50];
+  c6[0] -= b[30] * a[48];
+  c6[1] -= b[30] * a[49];
+  c6[2] -= b[30] * a[50];
+  c7[0] -= b[31] * a[48];
+  c7[1] -= b[31] * a[49];
+  c7[2] -= b[31] * a[50];
+
+  b[16] = (c0[2] *= a[34]);
+  b[17] = (c1[2] *= a[34]);
+  b[18] = (c2[2] *= a[34]);
+  b[19] = (c3[2] *= a[34]);
+  b[20] = (c4[2] *= a[34]);
+  b[21] = (c5[2] *= a[34]);
+  b[22] = (c6[2] *= a[34]);
+  b[23] = (c7[2] *= a[34]);
+  c0[0] -= b[16] * a[32];
+  c0[1] -= b[16] * a[33];
+  c1[0] -= b[17] * a[32];
+  c1[1] -= b[17] * a[33];
+  c2[0] -= b[18] * a[32];
+  c2[1] -= b[18] * a[33];
+  c3[0] -= b[19] * a[32];
+  c3[1] -= b[19] * a[33];
+  c4[0] -= b[20] * a[32];
+  c4[1] -= b[20] * a[33];
+  c5[0] -= b[21] * a[32];
+  c5[1] -= b[21] * a[33];
+  c6[0] -= b[22] * a[32];
+  c6[1] -= b[22] * a[33];
+  c7[0] -= b[23] * a[32];
+  c7[1] -= b[23] * a[33];
+
+  b[ 8] = (c0[1] *= a[17]);
+  b[ 9] = (c1[1] *= a[17]);
+  b[10] = (c2[1] *= a[17]);
+  b[11] = (c3[1] *= a[17]);
+  b[12] = (c4[1] *= a[17]);
+  b[13] = (c5[1] *= a[17]);
+  b[14] = (c6[1] *= a[17]);
+  b[15] = (c7[1] *= a[17]);
+  c0[0] -= b[ 8] * a[16];
+  c1[0] -= b[ 9] * a[16];
+  c2[0] -= b[10] * a[16];
+  c3[0] -= b[11] * a[16];
+  c4[0] -= b[12] * a[16];
+  c5[0] -= b[13] * a[16];
+  c6[0] -= b[14] * a[16];
+  c7[0] -= b[15] * a[16];
+
+  b[0] = (c0[0] *= a[0]);
+  b[1] = (c1[0] *= a[0]);
+  b[2] = (c2[0] *= a[0]);
+  b[3] = (c3[0] *= a[0]);
+  b[4] = (c4[0] *= a[0]);
+  b[5] = (c5[0] *= a[0]);
+  b[6] = (c6[0] *= a[0]);
+  b[7] = (c7[0] *= a[0]);
+}
+
+#endif
+
+static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa, bb;
+
+  int i, j, k;
+
+  a += (m - 1) * m;
+  b += (m - 1) * n;
+
+  for (i = m - 1; i >= 0; i--) {
+
+    aa = *(a + i);
+
+    for (j = 0; j < n; j ++) {
+      bb = *(c + i + j * ldc);
+      bb *= aa;
+      *b  = bb;
+      *(c + i + j * ldc) = bb;
+      b ++;
+
+      for (k = 0; k < i; k ++){
+        *(c + k + j * ldc) -= bb * *(a + k);
+      }
+
+    }
+    a -= m;
+    b -= 2 * n;
+  }
+
+}
+
+#else
+
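+/* Complex variant of the scalar solver: each element is a (real, imag)
+   pair, the packed diagonal holds complex reciprocals, and the update
+   uses complex multiplication (conjugated when CONJ is defined). */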
+static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa1, aa2;
+  FLOAT bb1, bb2;
+  FLOAT cc1, cc2;
+
+  int i, j, k;
+
+  ldc *= 2;
+  a += (m - 1) * m * 2;
+  b += (m - 1) * n * 2;
+
+  for (i = m - 1; i >= 0; i--) {
+
+    aa1 = *(a + i * 2 + 0);
+    aa2 = *(a + i * 2 + 1);
+
+    for (j = 0; j < n; j ++) {
+      bb1 = *(c + i * 2 + 0 + j * ldc);
+      bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+      cc1 = aa1 * bb1 - aa2 * bb2;
+      cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+      cc1 = aa1 * bb1 + aa2 * bb2;
+      cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+
+      *(b + 0) = cc1;
+      *(b + 1) = cc2;
+      *(c + i * 2 + 0 + j * ldc) = cc1;
+      *(c + i * 2 + 1 + j * ldc) = cc2;
+      b += 2;
+
+      for (k = 0; k < i; k ++){
+#ifndef CONJ
+        *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
+        *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#else
+        *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
+        *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#endif
+      }
+
+    }
+    a -= m * 2;
+    b -= 4 * n;
+  }
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+          FLOAT dummy2,
+#endif
+          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+  BLASLONG i, j;
+  FLOAT *aa, *cc;
+  BLASLONG kk;
+
+#if 0
+  fprintf(stderr, "TRSM KERNEL LN : m = %3ld  n = %3ld  k = %3ld offset = %3ld\n",
+          m, n, k, offset);
+#endif
+
+#ifdef DOUBLE
+  int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0);
+#else
+  int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0);
+#endif
+
+  j = (n >> GEMM_UNROLL_N_SHIFT);
+
+  while (j > 0) {
+
+    kk = m + offset;
+
+    if (m & (GEMM_UNROLL_M - 1)) {
+      for (i = 1; i < GEMM_UNROLL_M; i *= 2){
+        if (m & i) {
+          aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
+          cc = c + ((m & ~(i - 1)) - i)     * COMPSIZE;
+
+          if (k - kk > 0) {
+            GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+                        ZERO,
+#endif
+                        aa + i             * kk * COMPSIZE,
+                        b  + GEMM_UNROLL_N * kk * COMPSIZE,
+                        cc,
+                        ldc);
+          }
+
+          solve(i, GEMM_UNROLL_N,
+                aa + (kk - i) * i             * COMPSIZE,
+                b  + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
+                cc, ldc);
+
+          kk -= i;
+        }
+      }
+    }
+
+    i = (m >> GEMM_UNROLL_M_SHIFT);
+    if (i > 0) {
+      aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE;
+      cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M)     * COMPSIZE;
+
+      do {
+        if (k - kk > 0) {
+          GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+                      ZERO,
+#endif
+                      aa + GEMM_UNROLL_M * kk * COMPSIZE,
+                      b  + GEMM_UNROLL_N * kk * COMPSIZE,
+                      cc,
+                      ldc);
+        }
+
+        if (well_aligned) {
+#ifdef DOUBLE
+          solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                   b  + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
+#else
+          solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                    b  + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
+#endif
+        }
+        else {
+          solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
+                aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                b  + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE,
+                cc, ldc);
+        }
+
+        aa -= GEMM_UNROLL_M * k * COMPSIZE;
+        cc -= GEMM_UNROLL_M     * COMPSIZE;
+        kk -= GEMM_UNROLL_M;
+        i --;
+      } while (i > 0);
+    }
+
+    b += GEMM_UNROLL_N * k   * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+    j --;
+  }
+
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    j = (GEMM_UNROLL_N >> 1);
+    while (j > 0) {
+      if (n & j) {
+
+        kk = m + offset;
+
+        if (m & (GEMM_UNROLL_M - 1)) {
+          for (i = 1; i < GEMM_UNROLL_M; i *= 2){
+            if (m & i) {
+              aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
+              cc = c + ((m & ~(i - 1)) - i)     * COMPSIZE;
+
+              if (k - kk > 0) {
+                GEMM_KERNEL(i, j, k - kk, dm1,
+#ifdef COMPLEX
+                            ZERO,
+#endif
+                            aa + i * kk * COMPSIZE,
+                            b  + j * kk * COMPSIZE,
+                            cc, ldc);
+              }
+
+              solve(i, j,
+                    aa + (kk - i) * i * COMPSIZE,
+                    b  + (kk - i) * j * COMPSIZE,
+                    cc, ldc);
+
+              kk -= i;
+            }
+          }
+        }
+
+        i = (m >> GEMM_UNROLL_M_SHIFT);
+        if (i > 0) {
+          aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE;
+          cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M)     * COMPSIZE;
+
+          do {
+            if (k - kk > 0) {
+              GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
+#ifdef COMPLEX
+                          ZERO,
+#endif
+                          aa + GEMM_UNROLL_M * kk * COMPSIZE,
+                          b  + j             * kk * COMPSIZE,
+                          cc,
+                          ldc);
+            }
+
+            solve(GEMM_UNROLL_M, j,
+                  aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+                  b  + (kk - GEMM_UNROLL_M) * j             * COMPSIZE,
+                  cc, ldc);
+
+            aa -= GEMM_UNROLL_M * k * COMPSIZE;
+            cc -= GEMM_UNROLL_M     * COMPSIZE;
+            kk -= GEMM_UNROLL_M;
+            i --;
+          } while (i > 0);
+        }
+
+        b += j * k   * COMPSIZE;
+        c += j * ldc * COMPSIZE;
+      }
+      j >>= 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c
new file mode 100644
index 000000000..14ff12fe4
--- /dev/null
+++ b/kernel/power/trsm_kernel_LT_power10.c
@@ -0,0 +1,1265 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or       */
+/* without modification, are permitted provided that the following  */
+/* conditions are met:                                              */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above        */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer.                                                 */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above     */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer in the documentation and/or other materials      */
+/*      provided with the distribution.                             */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT      */
+/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
+/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF      */
+/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE      */
+/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
+/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES      */
+/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE     */
+/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR          */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT     */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT    */
+/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
+/*    POSSIBILITY OF SUCH DAMAGE.                                   */
+/*                                                                   */
+/* The views and conclusions contained in the software and          */
+/* documentation are those of the authors and should not be         */
+/* interpreted as representing official policies, either expressed  */
+/* or implied, of The University of Texas at Austin.                */
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+/* Solve one packed 8x8 triangular block against eight columns of C,
+   working from row 0 up to row 7 (LT case). The diagonal entries of
+   a[] are stored pre-inverted by the packing routine, so each
+   division is a multiply; solved values go back to both b[] and c[]. */
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+  c0 = &c[0*ldc];
+  c1 = &c[1*ldc];
+  c2 = &c[2*ldc];
+  c3 = &c[3*ldc];
+  c4 = &c[4*ldc];
+  c5 = &c[5*ldc];
+  c6 = &c[6*ldc];
+  c7 = &c[7*ldc];
+  vector FLOAT *Va = (vector FLOAT *) a;
+  vector FLOAT *Vb = (vector FLOAT *) b;
+  vector FLOAT *Vc0 = (vector FLOAT *) c0;
+  vector FLOAT *Vc1 = (vector FLOAT *) c1;
+  vector FLOAT *Vc2 = (vector FLOAT *) c2;
+  vector FLOAT *Vc3 = (vector FLOAT *) c3;
+  vector FLOAT *Vc4 = (vector FLOAT *) c4;
+  vector FLOAT *Vc5 = (vector FLOAT *) c5;
+  vector FLOAT *Vc6 = (vector FLOAT *) c6;
+  vector FLOAT *Vc7 = (vector FLOAT *) c7;
+  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
+
+  b[0] = (c0[0] *= a[0]);
+  b[1] = (c1[0] *= a[0]);
+  b[2] = (c2[0] *= a[0]);
+  b[3] = (c3[0] *= a[0]);
+  b[4] = (c4[0] *= a[0]);
+  b[5] = (c5[0] *= a[0]);
+  b[6] = (c6[0] *= a[0]);
+  b[7] = (c7[0] *= a[0]);
+  VbS0 = vec_splat(Vb[0], 0);
+  VbS1 = vec_splat(Vb[0], 1);
+  VbS2 = vec_splat(Vb[1], 0);
+  VbS3 = vec_splat(Vb[1], 1);
+  VbS4 = vec_splat(Vb[2], 0);
+  VbS5 = vec_splat(Vb[2], 1);
+  VbS6 = vec_splat(Vb[3], 0);
+  VbS7 = vec_splat(Vb[3], 1);
+  Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]);
+  Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]);
+  Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]);
+  Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]);
+  Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]);
+  Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]);
+  Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]);
+  Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]);
+  Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]);
+  Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]);
+  Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]);
+  Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]);
+  Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]);
+  Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]);
+  Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]);
+  Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]);
+  Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]);
+  Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]);
+  Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]);
+  Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]);
+  Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]);
+  Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]);
+  Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]);
+  Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]);
+  c0[1] -= c0[0] * a[1];
+  c1[1] -= c1[0] * a[1];
+  c2[1] -= c2[0] * a[1];
+  c3[1] -= c3[0] * a[1];
+  c4[1] -= c4[0] * a[1];
+  c5[1] -= c5[0] * a[1];
+  c6[1] -= c6[0] * a[1];
+  c7[1] -= c7[0] * a[1];
+
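+  /* Rows 1 through 7 repeat this forward-substitution step: scale row
+     i by the pre-inverted diagonal entry, store the solved row into
+     the packed b[] buffer, then eliminate it from all following rows. */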
b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, 
Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + 
vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = 
vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * 
a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], 
Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 
0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a 
+ i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * 
COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+  c0 = &c[0*ldc];
+  c1 = &c[1*ldc];
+  c2 = &c[2*ldc];
+  c3 = &c[3*ldc];
+  c4 = &c[4*ldc];
+  c5 = &c[5*ldc];
+  c6 = &c[6*ldc];
+  c7 = &c[7*ldc];
+  vector FLOAT *Vb = (vector FLOAT *) b;
+  vector FLOAT *Vc0 = (vector FLOAT *) c0;
+  vector FLOAT *Vc1 = (vector FLOAT *) c1;
+  vector FLOAT *Vc2 = (vector FLOAT *) c2;
+  vector FLOAT *Vc3 = (vector FLOAT *) c3;
+  vector FLOAT *Vc4 = (vector FLOAT *) c4;
+  vector FLOAT *Vc5 = (vector FLOAT *) c5;
+  vector FLOAT *Vc6 = (vector FLOAT *) c6;
+  vector FLOAT *Vc7 = (vector FLOAT *) c7;
+  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;
+
+  a[0] = (c0[0] *= b[0]);
+  a[1] = (c0[1] *= b[0]);
+  a[2] = (c0[2] *= b[0]);
+  a[3] = (c0[3] *= b[0]);
+  a[4] = (c0[4] *= b[0]);
+  a[5] = (c0[5] *= b[0]);
+  a[6] = (c0[6] *= b[0]);
+  a[7] = (c0[7] *= b[0]);
+  VbS0 = vec_splat(Vb[0], 1);
+  VbS1 = vec_splat(Vb[1], 0);
+  VbS2 = vec_splat(Vb[1], 1);
+  VbS3 = vec_splat(Vb[2], 0);
+  VbS4 = vec_splat(Vb[2], 1);
+  VbS5 = vec_splat(Vb[3], 0);
+  VbS6 = vec_splat(Vb[3], 1);
+  Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]);
+  Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]);
+  Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]);
+  Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]);
+  Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]);
+  Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]);
+  Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]);
+  Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]);
+  Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]);
+  Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]);
+  Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]);
+  Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]);
+  Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]);
+  Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]);
+  Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]);
+  Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]);
+  Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]);
+  Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]);
+  Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]);
+  Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]);
+  Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]);
+  Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]);
+  Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]);
+  Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]);
+  Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]);
+  Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]);
+  Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]);
+  Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]);
+
+  a[ 8] = (c1[0] *= b[9]);
+  a[ 9] = (c1[1] *=
b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = 
vec_nmsub(Vc3[0], VbS3, Vc7[0]); + Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = 
Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, 
Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; 
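+      /* End of one full GEMM_UNROLL_N-wide panel: kk now counts the rows of
+         B that are completely solved, so the GEMM_KERNEL call in the next
+         iteration subtracts exactly the finished contribution before the
+         next diagonal block is solved. */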
+ j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+  c0 = &c[0*ldc];
+  c1 = &c[1*ldc];
+  c2 = &c[2*ldc];
+  c3 = &c[3*ldc];
+  c4 = &c[4*ldc];
+  c5 = &c[5*ldc];
+  c6 = &c[6*ldc];
+  c7 = &c[7*ldc];
+  vector FLOAT *Vb = (vector FLOAT *) b;
+  vector FLOAT *Vc0 = (vector FLOAT *) c0;
+  vector FLOAT *Vc1 = (vector FLOAT *) c1;
+  vector FLOAT *Vc2 = (vector FLOAT *) c2;
+  vector FLOAT *Vc3 = (vector FLOAT *) c3;
+  vector FLOAT *Vc4 = (vector FLOAT *) c4;
+  vector FLOAT *Vc5 = (vector FLOAT *) c5;
+  vector FLOAT *Vc6 = (vector FLOAT *) c6;
+  vector FLOAT *Vc7 = (vector FLOAT *) c7;
+  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;
+
+  a[56] = (c7[0] *= b[63]);
+  a[57] = (c7[1] *= b[63]);
+  a[58] = (c7[2] *= b[63]);
+  a[59] = (c7[3] *= b[63]);
+  a[60] = (c7[4] *= b[63]);
+  a[61] = (c7[5] *= b[63]);
+  a[62] = (c7[6] *= b[63]);
+  a[63] = (c7[7] *= b[63]);
+  VbS0 = vec_splat(Vb[28], 0);
+  VbS1 = vec_splat(Vb[28], 1);
+  VbS2 = vec_splat(Vb[29], 0);
+  VbS3 = vec_splat(Vb[29], 1);
+  VbS4 = vec_splat(Vb[30], 0);
+  VbS5 = vec_splat(Vb[30], 1);
+  VbS6 = vec_splat(Vb[31], 0);
+  Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]);
+  Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]);
+  Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]);
+  Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]);
+  Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]);
+  Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]);
+  Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]);
+  Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]);
+  Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]);
+  Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]);
+  Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]);
+  Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]);
+  Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]);
+  Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]);
+  Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]);
+  Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]);
+  Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]);
+  Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]);
+  Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]);
+  Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]);
+  Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]);
+  Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]);
+  Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]);
+  Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]);
+  Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]);
+  Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]);
+  Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]);
+  Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]);
+
+  a[48] = (c6[0] *= b[54]);
+  a[49] = (c6[1] *= b[54]);
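+  /* Note: as in the scalar solve() further below, the packed diagonal
+     entries (b[63], b[54], ...) are assumed to be pre-inverted by the
+     packing routines, so each division by the diagonal is carried out
+     as a multiply. */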
+ a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + 
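+  /* Column 3 receives the last of these rank-1 updates: every column to
+     the left of the solved column 4 is reduced by that column scaled with
+     the matching splatted entry of the packed triangular factor. */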
Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = 
Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, 
Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, 
Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 
0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL new file mode 100644 index 000000000..68d68b5f8 --- /dev/null +++ b/kernel/riscv64/KERNEL @@ -0,0 +1,30 @@ +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V new file mode 100644 index 000000000..0da66fa35 --- /dev/null +++ b/kernel/riscv64/KERNEL.C910V @@ -0,0 +1,190 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = 
zaxpy_vector.c + +SAXPBYKERNEL = axpby_vector.c +DAXPBYKERNEL = axpby_vector.c +CAXPBYKERNEL = zaxpby_vector.c +ZAXPBYKERNEL = zaxpby_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = ../generic/trmmkernel_16x4.c +DTRMMKERNEL = ../generic/trmmkernel_8x4.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_16x4_c910v.c +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_8x4_c910v.c +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c 
+DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = zhemv_UV_vector.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC new file mode 100644 index 000000000..ea6a8cf21 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -0,0 +1,164 @@ +SAMAXKERNEL = ../riscv64/amax.c +DAMAXKERNEL = ../riscv64/amax.c +CAMAXKERNEL = ../riscv64/zamax.c +ZAMAXKERNEL = ../riscv64/zamax.c + +SAMINKERNEL = ../riscv64/amin.c +DAMINKERNEL = ../riscv64/amin.c +CAMINKERNEL = ../riscv64/zamin.c +ZAMINKERNEL = ../riscv64/zamin.c + +SMAXKERNEL = ../riscv64/max.c +DMAXKERNEL = ../riscv64/max.c + +SMINKERNEL = ../riscv64/min.c +DMINKERNEL = ../riscv64/min.c + +ISAMAXKERNEL = ../riscv64/iamax.c +IDAMAXKERNEL = ../riscv64/iamax.c +ICAMAXKERNEL = ../riscv64/izamax.c +IZAMAXKERNEL = ../riscv64/izamax.c + +ISAMINKERNEL = ../riscv64/iamin.c +IDAMINKERNEL = ../riscv64/iamin.c +ICAMINKERNEL = ../riscv64/izamin.c +IZAMINKERNEL = ../riscv64/izamin.c + +ISMAXKERNEL = ../riscv64/imax.c +IDMAXKERNEL = ../riscv64/imax.c + +ISMINKERNEL = ../riscv64/imin.c +IDMINKERNEL = ../riscv64/imin.c + +SASUMKERNEL = ../riscv64/asum.c +DASUMKERNEL = ../riscv64/asum.c +CASUMKERNEL = ../riscv64/zasum.c +ZASUMKERNEL = ../riscv64/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../riscv64/axpy.c +DAXPYKERNEL = ../riscv64/axpy.c +CAXPYKERNEL = ../riscv64/zaxpy.c +ZAXPYKERNEL = ../riscv64/zaxpy.c + +SCOPYKERNEL = ../riscv64/copy.c +DCOPYKERNEL = ../riscv64/copy.c +CCOPYKERNEL = ../riscv64/zcopy.c +ZCOPYKERNEL = ../riscv64/zcopy.c + +SDOTKERNEL = ../riscv64/dot.c +DDOTKERNEL = ../riscv64/dot.c +CDOTKERNEL = ../riscv64/zdot.c +ZDOTKERNEL = ../riscv64/zdot.c + +SNRM2KERNEL = ../riscv64/nrm2.c +DNRM2KERNEL = ../riscv64/nrm2.c +CNRM2KERNEL = ../riscv64/znrm2.c +ZNRM2KERNEL = ../riscv64/znrm2.c + +SROTKERNEL = ../riscv64/rot.c +DROTKERNEL = ../riscv64/rot.c +CROTKERNEL = ../riscv64/zrot.c +ZROTKERNEL = ../riscv64/zrot.c + +SSCALKERNEL = ../riscv64/scal.c +DSCALKERNEL = ../riscv64/scal.c +CSCALKERNEL = ../riscv64/zscal.c +ZSCALKERNEL = ../riscv64/zscal.c + +SSWAPKERNEL = ../riscv64/swap.c +DSWAPKERNEL = ../riscv64/swap.c +CSWAPKERNEL = ../riscv64/zswap.c +ZSWAPKERNEL = ../riscv64/zswap.c + +SGEMVNKERNEL = ../riscv64/gemv_n.c +DGEMVNKERNEL = ../riscv64/gemv_n.c +CGEMVNKERNEL = ../riscv64/zgemv_n.c +ZGEMVNKERNEL = ../riscv64/zgemv_n.c + +SGEMVTKERNEL = ../riscv64/gemv_t.c +DGEMVTKERNEL = ../riscv64/gemv_t.c +CGEMVTKERNEL = ../riscv64/zgemv_t.c +ZGEMVTKERNEL = ../riscv64/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = 
../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/amax.c b/kernel/riscv64/amax.c new file mode 100644 index 000000000..792e68bd9 --- /dev/null +++ b/kernel/riscv64/amax.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT maxf=0.0;
+
+    if (n <= 0 || inc_x <= 0) return(maxf);
+
+    maxf=ABS(x[0]);
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( ABS(x[ix]) > maxf )
+        {
+            maxf = ABS(x[ix]);
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(maxf);
+}
+
+
diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
new file mode 100644
index 000000000..b6aec131e
--- /dev/null
+++ b/kernel/riscv64/amax_vector.c
@@ -0,0 +1,245 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + BLASLONG inc_xv = inc_x * gvl; + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c new file mode 100644 index 000000000..78495a8e3 --- /dev/null +++ b/kernel/riscv64/amin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+
+    if (n <= 0 || inc_x <= 0) return(minf);
+
+    minf=ABS(x[0]);
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( ABS(x[ix]) < minf )
+        {
+            minf = ABS(x[ix]);
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(minf);
+}
+
+
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
new file mode 100644
index 000000000..53243ad56
--- /dev/null
+++ b/kernel/riscv64/amin_vector.c
@@ -0,0 +1,241 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c new file mode 100644 index 000000000..7ab7484e8 --- /dev/null +++ b/kernel/riscv64/asum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*8 \n\t"\ + "addi t2, %[PB], 2*8 \n\t"\ + "addi t3, %[PB], 3*8 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "fld ft1, (t1) \n\t"\ + "fld ft2, (t2) \n\t"\ + "fld ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 2*8 \n\t"\ + "addi t5, %[PA], 4*8 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 6*8 \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + 
"vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "fld ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "fld ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "fld ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "fld ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], 
stride_x, gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + } + return(dot); +} + + diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c new file mode 100644 index 000000000..ef61b245b --- /dev/null +++ b/kernel/riscv64/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c new file mode 100644 index 000000000..3aa64afc9 --- /dev/null +++ b/kernel/riscv64/iamax_vector.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0, j=0;
+    FLOAT maxf=0.0;
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+    unsigned int gvl = 0;
+    if(inc_x == 1){
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_max_index = VMVVX_UINT(0, gvl);
+        v_max = VFMVVF_FLOAT(-1, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+            v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+            v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+            j += gvl;
+        }
+        vx = VFMVVF_FLOAT(0, gvl);
+        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+        maxf = vx[0];
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        max_index = VMFIRSTM(mask,gvl);
+        max_index = v_max_index[max_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            vx = VFMVVF_FLOAT(0, gvl);
+            vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+            FLOAT cur_maxf = vx[0];
+            if(cur_maxf > maxf){
+                //tail index
+                v_max_index = VIDV_UINT(gvl);
+                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+            }
+        }
+    }else{
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x;
+
+        v_max_index = VMVVX_UINT(0, gvl);
+        v_max =
VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c new file mode 100644 index 000000000..155292bd5 --- /dev/null +++ b/kernel/riscv64/iamin.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : NoTest
+* BLASTEST double : NoTest
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+    BLASLONG min=0;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    minf=ABS(x[0]);
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( ABS(x[ix]) < ABS(minf) )
+        {
+            min = i;
+            minf = ABS(x[ix]);
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(min+1);
+}
+
+
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
new file mode 100644
index 000000000..608f19a00
--- /dev/null
+++ b/kernel/riscv64/iamin_vector.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0, j=0;
+    FLOAT minf=FLT_MAX;
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+
+    FLOAT_V_T vx, v_min;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+    unsigned int gvl = 0;
+    if(inc_x == 1){
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+        v_min_index = VMVVX_UINT(0, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            //index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+            v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl);
+            v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx, gvl);
+            j += gvl;
+        }
+        vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+        vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+        minf = vx[0];
+        mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+        min_index = VMFIRSTM(mask,gvl);
+        min_index = v_min_index[min_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            vx = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            mask = VMFLTVF_FLOAT(vx, 0, gvl);
+            v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+            vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+            vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+            FLOAT cur_minf = vx[0];
+            if(cur_minf < minf){
+                //tail index
+                v_min_index = VIDV_UINT(gvl);
+                v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
+                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                min_index = VMFIRSTM(mask,gvl);
+                min_index = v_min_index[min_index];
+            }
+        }
+    }else{
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x;
+
+        v_min = VFMVVF_FLOAT(FLT_MAX,
gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c new file mode 100644 index 000000000..5072dd16e --- /dev/null +++ b/kernel/riscv64/imax.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : NoTest
+* BLASTEST double : NoTest
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT maxf=0.0;
+    BLASLONG max=0;
+
+    if (n <= 0 || inc_x <= 0) return(max);
+
+    maxf=x[0];
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( x[ix] > maxf )
+        {
+            max = i;
+            maxf = x[ix];
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(max+1);
+}
+
+
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
new file mode 100644
index 000000000..44af7101b
--- /dev/null
+++ b/kernel/riscv64/imax_vector.c
@@ -0,0 +1,176 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0, j=0;
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+    FLOAT maxf=-FLT_MAX;
+
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+    unsigned int gvl = 0;
+    if(inc_x == 1){
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_max_index = VMVVX_UINT(0, gvl);
+        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLEV_FLOAT(&x[j], gvl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+            v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+            v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+            j += gvl;
+        }
+        vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+        maxf = vx[0];
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        max_index = VMFIRSTM(mask,gvl);
+        max_index = v_max_index[max_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            v_max = VLEV_FLOAT(&x[j], gvl);
+
+            vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+            vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+            FLOAT cur_maxf = vx[0];
+            if(cur_maxf > maxf){
+                //tail index
+                v_max_index = VIDV_UINT(gvl);
+                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+            }
+        }
+    }else{
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x;
+
+        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        v_max_index = VMVVX_UINT(0, gvl);
+        for(i=0,j=0; i < n/gvl; i++){
+            vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+            v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+            v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx,
gvl);
+                j += gvl;
+                idx += inc_v;
+        }
+        vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+        maxf = vx[0];
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        max_index = VMFIRSTM(mask,gvl);
+        max_index = v_max_index[max_index];
+
+        if(j < n){
+            gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+            v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+            vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+            vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+            FLOAT cur_maxf = vx[0];
+            if(cur_maxf > maxf){
+                //tail index
+                v_max_index = VIDV_UINT(gvl);
+                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+            }
+        }
+    }
+    return(max_index+1);
+}
+
+
diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c
new file mode 100644
index 000000000..598cba387
--- /dev/null
+++ b/kernel/riscv64/imin.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/08/19 Saar
+* BLASTEST float
+* BLASTEST double
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+    BLASLONG min=0;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    minf=x[0];
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        if( x[ix] < minf )
+        {
+            min = i;
+            minf = x[ix];
+        }
+        ix += inc_x;
+        i++;
+    }
+    return(min+1);
+}
+
+
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
new file mode 100644
index 000000000..e6e0e9f9f
--- /dev/null
+++ b/kernel/riscv64/imin_vector.c
@@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
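[Editor's note] All of the vector kernels in this patch share one stripmining shape: floor(n/gvl) full chunks at a constant gvl, then one vsetvli re-issue for the remainder. A compact scalar skeleton of that control flow, assuming (as the kernels do) that gvl stays fixed across the full chunks; body() is a placeholder, not a patch symbol:

#include <stddef.h>

/* body(start, len) stands in for one vector iteration at vector length len. */
static void stripmine(size_t n, size_t gvl,
                      void (*body)(size_t start, size_t len))
{
    size_t j = 0;
    for (size_t i = 0; i < n / gvl; i++) {  /* full chunks, constant gvl        */
        body(j, gvl);
        j += gvl;
    }
    if (j < n)                              /* kernel: gvl = vsetvli(n - j, ...) */
        body(j, n - j);
}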
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = 
VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c new file mode 100644 index 000000000..8fe33e95b --- /dev/null +++ b/kernel/riscv64/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c new file mode 100644 index 000000000..62c95d973 --- /dev/null +++ b/kernel/riscv64/izamax_vector.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
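[Editor's note] The CABS1 macro in izamax.c above encodes the BLAS convention that complex i?amax ranks elements by |Re|+|Im| rather than by the Euclidean modulus. The sketch below (cabs1_ref is an illustrative name) shows a case where the two measures disagree:

#include <math.h>
#include <stdio.h>

/* CABS1(x,i) = |Re| + |Im|: cheaper than a true modulus, and the measure
   the reference BLAS mandates for complex iamax/iamin. */
static double cabs1_ref(const double *x, long i)
{
    return fabs(x[i]) + fabs(x[i + 1]);
}

int main(void)
{
    double z[] = { 3.0, 4.0, 5.0, 0.0 };   /* interleaved: 3+4i, 5+0i */
    /* Euclidean moduli tie at 5, but the 1-norm ranks 3+4i first:   */
    printf("%g vs %g\n", cabs1_ref(z, 0), cabs1_ref(z, 2));  /* 7 vs 5 */
    return 0;
}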
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + 
"vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element greater than v_max + mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + maxf = vx0[0]; + mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_max = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + FLOAT cur_maxf = vx0[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c new file mode 100644 index 000000000..fb5a0d4cb --- /dev/null +++ b/kernel/riscv64/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c new file mode 100644 index 000000000..38eccf1b5 --- /dev/null +++ b/kernel/riscv64/izamin_vector.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
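[Editor's note] In the vectorized complex kernels (izamax_vector.c above, izamin_vector.c below), the two strided loads split the real and imaginary lanes, the masked vfrsub.vf stands in for fabs, and a vector add forms CABS1 per element. A scalar rendering of one stripmined chunk, with assumed helper names:

#include <stddef.h>

/* One chunk of the complex scheme: x holds interleaved (re,im) pairs,
   inc_x is counted in complex elements, out[k] receives CABS1. */
static void cabs1_chunk(const double *x, size_t inc_x, size_t gvl, double *out)
{
    for (size_t k = 0; k < gvl; k++) {
        double re = x[2 * inc_x * k];        /* VLSEV at &x[ix],   stride 2*inc_x */
        double im = x[2 * inc_x * k + 1];    /* VLSEV at &x[ix+1], same stride    */
        if (re < 0.0) re = -re;              /* masked vfrsub.vf == fabs          */
        if (im < 0.0) im = -im;
        out[k] = re + im;                    /* VFADDVV                           */
    }
}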
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, 
e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element less than v_min + mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + minf = vx0[0]; + mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_min = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + FLOAT cur_minf = vx0[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/riscv64/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c new file mode 100644 index 000000000..4ef75452d --- /dev/null +++ b/kernel/riscv64/max_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
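[Editor's note] Worth noting about the max.c kernel above: ?MAX takes no absolute value, so an all-negative vector yields its least-negative entry. A minimal check (reference names are illustrative):

#include <stdio.h>

static double max_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    double m = x[0];
    for (long i = 1, ix = inc_x; i < n; i++, ix += inc_x)
        if (x[ix] > m) m = x[ix];
    return m;
}

int main(void)
{
    double v[] = { -3.0, -7.0, -1.5 };
    printf("%g\n", max_ref(3, v, 1));  /* -1.5; AMAX would instead report 7 */
    return 0;
}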
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT maxf=-FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + BLASLONG idx = 0, inc_xv = inc_x * gvl; + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/riscv64/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
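[Editor's note] Parts of the unit-stride loop in max_vector.c above were lost to extraction (the text between the loop header and the final reduction is missing). Judging from the surviving guard if(gvl <= n/2) and the paired v0/v1 registers, the intent is a two-way unrolled maximum reduction; the following scalar sketch is a reconstruction under that assumption only, not the patch's code:

#include <float.h>
#include <stddef.h>

/* Reconstruction sketch: two running maxima per trip (v_max over v0 and v1
   in the kernel) to hide load latency, merged before the scalar tail. */
static double max_unrolled(size_t n, const double *x)
{
    if (n == 0) return 0.0;
    double m0 = -DBL_MAX, m1 = -DBL_MAX;  /* kernel seeds v_max with -FLT_MAX */
    size_t j = 0;
    for (; j + 2 <= n; j += 2) {          /* mirrors the i < n/(gvl*2) chunks */
        if (x[j]     > m0) m0 = x[j];
        if (x[j + 1] > m1) m1 = x[j + 1];
    }
    double m = m0 > m1 ? m0 : m1;         /* final VFREDMAXVS reduction       */
    for (; j < n; j++)                    /* tail chunk, vsetvli(n - j, ...)  */
        if (x[j] > m) m = x[j];
    return m;
}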
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c new file mode 100644 index 000000000..83c965bfa --- /dev/null +++ b/kernel/riscv64/min_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
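[Editor's note] One point that is easy to miss in these interfaces: inc_x strides through the buffer, so only every inc_x-th element participates. A short illustration (driver and names are illustrative):

#include <stdio.h>

static double min_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;
    double m = x[0];
    for (long i = 1, ix = inc_x; i < n; i++, ix += inc_x)
        if (x[ix] < m) m = x[ix];
    return m;
}

int main(void)
{
    double v[] = { 4.0, -9.0, 1.0, 2.0, 3.0, -1.0 };
    /* n=3, inc_x=2 visits v[0], v[2], v[4] only; -9.0 is never examined */
    printf("%g\n", min_ref(3, v, 2));  /* prints 1 */
    return 0;
}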
+*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c new file mode 100644 index 000000000..785c0d2f8 --- /dev/null +++ b/kernel/riscv64/nrm2_vector.c @@ -0,0 +1,220 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
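[Editor's note] The scalar nrm2.c above implements the classic overflow-safe 2-norm: it maintains the invariant scale^2 * ssq = (sum of squares seen so far), rescaling whenever a new largest |x_i| appears so no intermediate value overflows. A compact restatement with a case the naive formula cannot handle (names assumed):

#include <math.h>
#include <stdio.h>

/* Overflow-safe 2-norm: sum of squares kept as scale^2 * ssq, scale = max|x|. */
static double nrm2_ref(long n, const double *x)
{
    double scale = 0.0, ssq = 1.0;
    for (long i = 0; i < n; i++) {
        double a = fabs(x[i]);
        if (a == 0.0) continue;
        if (scale < a) {
            ssq = 1.0 + ssq * (scale / a) * (scale / a);  /* refit old sum */
            scale = a;
        } else {
            ssq += (a / scale) * (a / scale);
        }
    }
    return scale * sqrt(ssq);
}

int main(void)
{
    double big[] = { 3e200, 4e200 };   /* naive x[0]^2 + x[1]^2 overflows */
    printf("%g\n", nrm2_ref(2, big));  /* prints 5e+200 */
    return 0;
}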
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL16x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*4 
\n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL16x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL16x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL16x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + 
"addi t2, t2, 4*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t3, t3, 4*4 \n\t" + + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t" + + +#define KERNEL16x2_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + 
"vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t" + + +#define KERNEL16x2_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t" + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c new file mode 100644 index 000000000..9377bf4b9 --- /dev/null +++ b/kernel/riscv64/swap_vector.c @@ -0,0 +1,173 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += 
va[0]; + } + } + y[jy] += alpha * temp2; + jy += inc_y; + a_ptr += lda; + } + }else if(inc_y == 1){ + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + inc_xv = inc_x * gvl; + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + jx += inc_x; + a_ptr += lda; + } + }else{ + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U.c b/kernel/riscv64/symv_U.c new file mode 100644 index 000000000..b5a0c96e9 --- /dev/null +++ b/kernel/riscv64/symv_U.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if( m != offset ) + printf("Symv_U: m=%d offset=%d\n",m,offset); +#endif + + BLASLONG m1 = m - offset; + + jx = m1 * inc_x; + jy = m1 * inc_y; + + for (j=m1; j 0){ + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jy += inc_y; + } + }else if(inc_y == 1){ + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + 
if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + } + }else{ + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} + diff --git a/kernel/riscv64/zamax.c b/kernel/riscv64/zamax.c new file mode 100644 index 000000000..a39bd7821 --- /dev/null +++ b/kernel/riscv64/zamax.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c new file mode 100644 index 000000000..a6c742b14 --- /dev/null +++ b/kernel/riscv64/zamax_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max = VFMVVF_FLOAT(0, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i maxf) + maxf = v_max[0]; + } + return(maxf); +} diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c new file mode 100644 index 000000000..02eab3e75 --- /dev/null +++ b/kernel/riscv64/zamin.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c new file mode 100644 index 000000000..44a7cf1dc --- /dev/null +++ b/kernel/riscv64/zamin_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c new file mode 100644 index 000000000..d9fa88971 --- /dev/null +++ b/kernel/riscv64/zasum_vector.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < m){ + gvl = vsetvli(m-i, 
RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c new file mode 100644 index 000000000..6fe12c76c --- /dev/null +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLSEV_FLOAT vlsev_float32xm4 +#define VSSEV_FLOAT vssev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLSEV_FLOAT vlsev_float64xm4 +#define VSSEV_FLOAT vssev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = 
vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c new file mode 100644 index 000000000..fc1c8b54a --- /dev/null +++ b/kernel/riscv64/znrm2.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c new file mode 100644 index 000000000..b0ebfa5f4 --- /dev/null +++ b/kernel/riscv64/znrm2_vector.c @@ -0,0 +1,278 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); +// if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c new file mode 100644 index 000000000..b655a968c --- /dev/null +++ b/kernel/riscv64/zswap_vector.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG n2 = n * 2; + if(gvl <= n2/2){ + for(i=0,j=0; i #endif diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 3f79646e0..fbe531417 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -12,6 +12,8 @@ typedef __m256d v_f64; ***************************/ #define v_add_f32 _mm256_add_ps #define v_add_f64 _mm256_add_pd +#define v_sub_f32 _mm256_sub_ps +#define v_sub_f64 _mm256_sub_pd #define v_mul_f32 _mm256_mul_ps #define v_mul_f64 _mm256_mul_pd @@ -19,12 +21,20 @@ typedef __m256d v_f64; // multiply and add, a*b + c #define v_muladd_f32 _mm256_fmadd_ps #define v_muladd_f64 _mm256_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm256_fmsub_ps + #define v_mulsub_f64 _mm256_fmsub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // !HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. 
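
The hunk above (and the matching AVX-512/NEON/SSE hunks that follow) gives each backend both a true fused multiply-subtract (_mm256_fmsub_ps/_pd under HAVE_FMA3) and a plain multiply-then-subtract fallback. The two are not bit-identical: the FMSUB form rounds once, the fallback twice. A minimal scalar sketch of the difference — a hypothetical harness, not part of this patch; it assumes strict IEEE single-precision evaluation and no FP contraction (e.g. gcc -ffp-contract=off, link with -lm):

/* mulsub_fallback models the !HAVE_FMA3 branch of v_mulsub_f32,
 * mulsub_fma models the FMSUB instruction it stands in for. */
#include <math.h>
#include <stdio.h>

static float mulsub_fallback(float a, float b, float c)
{
    return (float)(a * b) - c;   /* product rounded first, then the difference */
}

static float mulsub_fma(float a, float b, float c)
{
    return fmaf(a, b, -c);       /* exact product, a single final rounding */
}

int main(void)
{
    /* a*b == 1 - 2^-46 exactly, which rounds to 1.0f as a float product */
    float a = 1.0f + 0x1p-23f, b = 1.0f - 0x1p-23f, c = 1.0f;
    printf("fallback: %a\n", mulsub_fallback(a, b, c)); /* prints 0x0p+0   */
    printf("fma:      %a\n", mulsub_fma(a, b, c));      /* prints -0x1p-46 */
    return 0;
}

Kernels built on v_mulsub therefore have to tolerate a last-ulp difference between FMA and non-FMA targets, exactly as they already do for the existing v_muladd.
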
diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index f00af53e9..8f38eedd9 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -12,11 +12,16 @@ typedef __m512d v_f64; ***************************/ #define v_add_f32 _mm512_add_ps #define v_add_f64 _mm512_add_pd +#define v_sub_f32 _mm512_sub_ps +#define v_sub_f64 _mm512_sub_pd #define v_mul_f32 _mm512_mul_ps #define v_mul_f64 _mm512_mul_pd // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps #define v_muladd_f64 _mm512_fmadd_pd +// multiply and subtract, a*b - c +#define v_mulsub_f32 _mm512_fmsub_ps +#define v_mulsub_f64 _mm512_fmsub_pd BLAS_FINLINE float v_sum_f32(v_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 22cef10ca..cd44599fe 100644 --- a/kernel/simd/intrin_neon.h +++ b/kernel/simd/intrin_neon.h @@ -18,6 +18,8 @@ typedef float32x4_t v_f32; ***************************/ #define v_add_f32 vaddq_f32 #define v_add_f64 vaddq_f64 +#define v_sub_f32 vsubq_f32 +#define v_sub_f64 vsubq_f64 #define v_mul_f32 vmulq_f32 #define v_mul_f64 vmulq_f64 @@ -26,16 +28,24 @@ typedef float32x4_t v_f32; // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vfmaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vfmaq_f32(vnegq_f32(c), a, b); } #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vmlaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vmlaq_f32(vnegq_f32(c), a, b); } #endif // FUSED F64 #if V_SIMD_F64 BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return vfmaq_f64(c, a, b); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return vfmaq_f64(vnegq_f64(c), a, b); } #endif // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 06a3fe78b..6a542072e 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -12,22 +12,35 @@ typedef __m128d v_f64; ***************************/ #define v_add_f32 _mm_add_ps #define v_add_f64 _mm_add_pd +#define v_sub_f32 _mm_sub_ps +#define v_sub_f64 _mm_sub_pd #define v_mul_f32 _mm_mul_ps #define v_mul_f64 _mm_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm_fmadd_ps #define v_muladd_f64 _mm_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_fmsub_ps + #define v_mulsub_f64 _mm_fmsub_pd #elif defined(HAVE_FMA4) // multiply and add, a*b + c #define v_muladd_f32 _mm_macc_ps #define v_muladd_f64 _mm_macc_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_msub_ps + #define v_mulsub_f64 _mm_msub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. 
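
With v_sub_* and v_mulsub_* now defined for all four backends (AVX, AVX-512, NEON, SSE), the universal-intrinsics layer covers both multiply-accumulate shapes that complex kernels need. As a usage sketch only — this helper is hypothetical and not part of the patch, though every macro it calls appears in the headers above:

/* One SIMD width of split-format complex multiply, c = a * b:
 *   re(c) = ar*br - ai*bi  ->  v_mulsub_f32 (a*b - c)
 *   im(c) = ar*bi + ai*br  ->  v_muladd_f32 (a*b + c)  */
BLAS_FINLINE void v_cmul_f32(v_f32 ar, v_f32 ai, v_f32 br, v_f32 bi,
                             v_f32 *cr, v_f32 *ci)
{
    *cr = v_mulsub_f32(ar, br, v_mul_f32(ai, bi));
    *ci = v_muladd_f32(ar, bi, v_mul_f32(ai, br));
}

On FMA3/FMA4/NEON targets each component costs one multiply plus one fused op; on the fallback paths the same expressions are evaluated with separate rounding steps and may differ only in the last ulp.
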
diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc index 2e8319ce5..1a2e9671a 100644 --- a/kernel/sparc/KERNEL.sparc +++ b/kernel/sparc/KERNEL.sparc @@ -54,3 +54,13 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + + +SDOTKERNEL = ../generic/dot.c +SDSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../generic/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 855e1ff8c..b92f480e9 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -384,6 +384,14 @@ endif GEMVDEP = ../l2param.h +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = sbgemv_t.c +endif + ifndef SGEMVNKERNEL SGEMVNKERNEL = sgemv_n.c endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index b979fc0ae..81eaf96ac 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 9b8b84c30..3d71584fe 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,3 +27,6 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c + +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h new file mode 100644 index 000000000..1014ecc4d --- /dev/null +++ b/kernel/x86_64/bf16_common_macros.h @@ -0,0 +1,795 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+#ifndef __BF16_COMMON_MACROS
+#define __BF16_COMMON_MACROS
+
+#include <immintrin.h>
+
+#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \
+    reg256##_0 = _mm512_castps512_ps256(reg512##_0); \
+    reg256##_1 = _mm512_castps512_ps256(reg512##_1);
+
+
+#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n) \
+    regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \
+    regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \
+    regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \
+    regArray = _mm512_loadu_si512(&a[idx_m*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask) \
+    regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask) \
+    regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+    regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+    regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+    regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
+    regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+    regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
+    regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+    regArray##_7 =
_mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); + +#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray = _mm512_maskz_loadu_epi16(mask, &a[idx_m*lda + idx_n]); + +#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n) \ + reg = _mm512_loadu_si512(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ + reg = _mm256_loadu_si256(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ + reg = _mm_loadu_si128(x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ + reg = _mm512_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask) \ + reg = _mm256_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask) \ + reg = _mm_maskz_loadu_epi16(mask, x + idx_n); + + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 
1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31 +*/ +#define BF16_INTERLEAVE_8x32(regArray) \ + regArray##_8 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 
|c10|c11|d10|d11 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15 +*/ +#define BF16_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm256_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm256_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm256_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm256_unpackhi_epi64(regArray##_14, regArray##_15); + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 +*/ +#define BF16_INTERLEAVE_4x32(regArray) \ + regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, 
regArray##_1); \ + regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 +*/ +#define BF16_INTERLEAVE_4x16(regArray) \ + regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for x with 32 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31 +*/ +#define BF16_INTERLEAVE_1x32(regArray) \ + regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, regArray##_3); + + +/* 2-step interleave for x with 16 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11 + 
|x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15 +*/ +#define BF16_INTERLEAVE_1x16(regArray) \ + regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3); + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 4 pair of registers + |a0|a1|...|a14|a15|i0|i1|...|i14|i15| + |b0|b1|...|b14|b15|j0|j1|...|j14|j15| + |c0|c1|...|c14|c15|k0|k1|...|k14|k15| + |d0|d1|...|d14|d15|l0|l1|...|l14|l15| + |e0|e1|...|e14|e15|m0|m1|...|m14|m15| + |f0|f1|...|f14|f15|n0|n1|...|n14|n15| + |g0|g1|...|g14|g15|o0|o1|...|o14|o15| + |h0|h1|...|h14|h15|p0|p1|...|p14|p15| +*/ +#define BF16_INTERLEAVE256_8x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0xee); \ + regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44); \ + regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee); \ + regArray##_6 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44); \ + regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee); + + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 2 pair of registers + |a0|a1|...|a14|a15|e0|e1|...|e14|e15| + |b0|b1|...|b14|b15|f0|f1|...|f14|f15| + |c0|c1|...|c14|c15|g0|g1|...|g14|g15| + |d0|d1|...|d14|d15|h0|h1|...|h14|h15| +*/ +#define BF16_INTERLEAVE256_4x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0xee); + + +#define BF16_PERMUTE_8x32(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7); + + +#define BF16_PERMUTE_8x32_2(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7); + + +#define BF16_PERMUTE_4x32(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi16(idx, 
regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3); + + +#define BF16_PERMUTE_4x32_2(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3); + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, 
(__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3); + + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray); \ + accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray); \ + accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray); \ + accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray); \ + accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray); \ + accumArray##_6 = _mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray); \ + accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_1x32(accumArray, matArray, xArray) \ + accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 16 elements per row + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray); \ + accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray); \ + accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray); \ + accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray); \ + accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray); \ + accumArray##_6 = _mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray); \ + accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray); + + +/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13| + |c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13| + |e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13| + |g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13| + |a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15| + |c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15| + |e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15| + |g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12| + |a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13| + |e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12| + |e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13| + 
|a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14| + |a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15| + |e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14| + |e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15| +*/ +#define FP32_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm512_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \ + regArray##_7 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15); + +#define FP32_INTERLEAVE_8x16_ARRAY(regArray) \ + regArray[8] = _mm512_unpacklo_ps(regArray[0], regArray[1]); \ + regArray[9] = _mm512_unpacklo_ps(regArray[2], regArray[3]); \ + regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]); \ + regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]); \ + regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]); \ + regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]); \ + regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]); \ + regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]); \ + \ + regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) regArray[15]); \ + regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]); + +/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5| + |c0|d0|c1|d1|c4|d4|c5|d5| + |e0|f0|e1|f1|e4|f4|e5|f5| + |g0|h0|g1|h1|g4|h4|g5|h5| + |a2|b2|a3|b3|a6|b6|a7|b7| + |c2|d2|c3|d3|c6|d6|c7|d7| + |e2|f2|e3|f3|e6|f6|e7|f7| + |g2|h2|g3|h3|g6|h6|g7|h7| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4| + |a1|b1|c1|d1|a5|b5|c5|d5| + |e0|f0|g0|h0|e4|f4|g4|h4| + |e1|f1|g1|h1|e5|f5|g5|h5| + |a2|b2|c2|d2|a6|b6|c6|d6| + 
|a3|b3|c3|d3|a7|b7|c7|d7| + |e2|f2|g2|h2|e6|f6|g6|h6| + |e3|f3|g3|h3|e7|f7|g7|h7| +*/ +#define FP32_INTERLEAVE_8x8(regArray) \ + regArray##_8 = _mm256_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \ + regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15); + + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x16(regArray) \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6); + +#define FP32_ACCUM2_8x16_ARRAY(regArray) \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[1]); \ + regArray[2] = _mm512_add_ps(regArray[2], regArray[3]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[5]); \ + regArray[6] = _mm512_add_ps(regArray[6], regArray[7]); \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[2]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[6]); + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x8(regArray) \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6); + + +/* Store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr))); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr))); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = 
_mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr))); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr))); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr))); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr))); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Masked store 16 (alpha * result) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Store 8 (alpha * result) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Masked store 8 (alpha * result) to y +*/ +#define 
STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
+    _mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
+
+
+/* Store 4 (alpha * result) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
+    _mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Masked store 4 (alpha * result) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
+    _mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Store 16 result to y
+*/
+#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+    _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 result to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+    _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 result to y
+*/
+#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+    _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 result to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+    _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 result to y
+*/
+#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+    _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 result to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+    _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+#endif
diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c
new file mode 100644
index 000000000..a1bd76f33
--- /dev/null
+++ b/kernel/x86_64/casum.c
@@ -0,0 +1,144 @@
+#include "common.h"
+
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
+
+#if defined(SKYLAKEX)
+#include "casum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_CASUM_KERNEL
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
+{
+    BLASLONG i = 0;
+    BLASLONG n_8 = n & -8;
+    FLOAT *x = x1;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+
+    while (i < n_8) {
+        temp0 = ABS_K(x[0]);
+        temp1 = ABS_K(x[1]);
+        temp2 = ABS_K(x[2]);
+        temp3 = ABS_K(x[3]);
+        temp4 = ABS_K(x[4]);
+        temp5 = ABS_K(x[5]);
+        temp6 = ABS_K(x[6]);
+        temp7 = ABS_K(x[7]);
+
+        sum0 += temp0;
+        sum1 += temp1;
+        sum2 += temp2;
+        sum3 += temp3;
+
+        sum0 += temp4;
+        sum1 += temp5;
+        sum2 += temp6;
+        sum3 += temp7;
+
+        x += 8;
+        i += 4;
+    }
+
+    /* scalar tail: continue from where the unrolled loop stopped */
+    while (i < n) {
+        sum4 += (ABS_K(x[0]) + ABS_K(x[1]));
+        x += 2;
+        i++;
+    }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ip = 0;
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    if (inc_x == 1) {
+        sumf = casum_kernel(n, x);
+    }
+    else {
+        inc_x2 = 2 * inc_x;
+
+        while (i < n) {
+            sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
+            ip += inc_x2;
+            i++;
+        }
+    }
+
+    return(sumf);
+}
+
+#if defined(SMP)
+static int asum_thread_function(BLASLONG n,
+                                BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+                                FLOAT *x, BLASLONG inc_x,
+                                FLOAT *dummy3, BLASLONG dummy4,
+                                FLOAT *result, BLASLONG dummy5)
+{
+    *(FLOAT *)result = asum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode,
+           BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
+           void *a, BLASLONG lda,
+           void *b, BLASLONG ldb,
+           void *c, BLASLONG ldc,
+           int (*function)(),
+           int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+
+    if (nthreads == 1) {
+        sumf = asum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) * 2];
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
+                                             NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
+        }
+    }
+#else
+    sumf = asum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
new file mode 100644
index 000000000..d51929f9f
--- /dev/null
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -0,0 +1,349 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_CASUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf = 0.0;
+    BLASLONG n2 = n + n;
+
+    if (n2 < 64) {
+        __m128 accum_10, accum_11, accum_12, accum_13;
+        __m128 abs_mask1;
+
+        accum_10 = _mm_setzero_ps();
+        accum_11 = _mm_setzero_ps();
+        accum_12 = _mm_setzero_ps();
+        accum_13 = _mm_setzero_ps();
+
+        /* all-ones compare with itself, then clear the sign bit: 0x7fffffff mask */
+        abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
+        abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
+
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+
+        if (n2 >= 32) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+
+            _mm_prefetch(&x1[16], _MM_HINT_T0);
+            __m128 x04 = _mm_loadu_ps(&x1[16]);
+            __m128 x05 = _mm_loadu_ps(&x1[20]);
+            __m128 x06 = _mm_loadu_ps(&x1[24]);
+            __m128 x07 = _mm_loadu_ps(&x1[28]);
+
+            x00 = _mm_and_ps(x00, abs_mask1);
+            x01 = _mm_and_ps(x01, abs_mask1);
+            x02 = _mm_and_ps(x02, abs_mask1);
+            x03 = _mm_and_ps(x03, abs_mask1);
+
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+
+            x04 = _mm_and_ps(x04, abs_mask1);
+            x05 = _mm_and_ps(x05, abs_mask1);
+            x06 = _mm_and_ps(x06, abs_mask1);
+            x07 = _mm_and_ps(x07, abs_mask1);
+
+            accum_10 = _mm_add_ps(accum_10, x04);
+            accum_11 = _mm_add_ps(accum_11, x05);
+            accum_12 = _mm_add_ps(accum_12, x06);
+            accum_13 = _mm_add_ps(accum_13, x07);
+
+            n2 -= 32;
+            x1 += 32;
+        }
+
+        if (n2 >= 16) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+
+            x00 = _mm_and_ps(x00, abs_mask1);
+            x01 = _mm_and_ps(x01, abs_mask1);
+            x02 = _mm_and_ps(x02, abs_mask1);
+            x03 = _mm_and_ps(x03, abs_mask1);
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+
+            n2 -= 16;
+            x1 += 16;
+        }
+
+        if (n2 >= 8) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            x00 = _mm_and_ps(x00, abs_mask1);
+            x01 = _mm_and_ps(x01,
abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + x00 = _mm_and_ps(x00, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + sumf += (ABS_K(x1[0]) + ABS_K(x1[1])); + } + + accum_10 = _mm_add_ps(accum_10, accum_11); + accum_12 = _mm_add_ps(accum_12, accum_13); + accum_10 = _mm_add_ps(accum_10, accum_12); + + accum_10 = _mm_hadd_ps(accum_10, accum_10); + accum_10 = _mm_hadd_ps(accum_10, accum_10); + + sumf += accum_10[0]; + } + else { + __m512 accum_0, accum_1, accum_2, accum_3; + __m512 x00, x01, x02, x03, x04, x05, x06, x07; + __m512 abs_mask = (__m512)_mm512_set1_epi32(0x7fffffff); + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 256) { + if (n2 >= 128) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[ 16]); + x02 = _mm512_loadu_ps(&x1[ 32]); + x03 = _mm512_loadu_ps(&x1[ 48]); + x04 = _mm512_loadu_ps(&x1[ 64]); + x05 = _mm512_loadu_ps(&x1[ 80]); + x06 = _mm512_loadu_ps(&x1[ 96]); + x07 = _mm512_loadu_ps(&x1[112]); + + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + n2 -= 128; + x1 += 128; + } + + if (n2 >= 64) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x02 = _mm512_loadu_ps(&x1[32]); + x03 = _mm512_loadu_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + + sumf = _mm512_reduce_add_ps(accum_0); + } + // n2 >= 256, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (0 != align_header) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header)); + x00 = 
_mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[ 16]); + x02 = _mm512_load_ps(&x1[ 32]); + x03 = _mm512_load_ps(&x1[ 48]); + x04 = _mm512_load_ps(&x1[ 64]); + x05 = _mm512_load_ps(&x1[ 80]); + x06 = _mm512_load_ps(&x1[ 96]); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + + while (n2 >= 128) { + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + x00 = _mm512_load_ps(&x1[ 0]); + accum_1 = _mm512_add_ps(accum_1, x01); + x01 = _mm512_load_ps(&x1[ 16]); + accum_2 = _mm512_add_ps(accum_2, x02); + x02 = _mm512_load_ps(&x1[ 32]); + accum_3 = _mm512_add_ps(accum_3, x03); + x03 = _mm512_load_ps(&x1[ 48]); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x04); + x04 = _mm512_load_ps(&x1[ 64]); + accum_1 = _mm512_add_ps(accum_1, x05); + x05 = _mm512_load_ps(&x1[ 80]); + accum_2 = _mm512_add_ps(accum_2, x06); + x06 = _mm512_load_ps(&x1[ 96]); + accum_3 = _mm512_add_ps(accum_3, x07); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + } + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + if (n2 >= 64) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x02 = _mm512_load_ps(&x1[32]); + x03 = _mm512_load_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + sumf = _mm512_reduce_add_ps(accum_0); + } + } + + return sumf; +} +#endif diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8a40ea4b9..ddec21383 
100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -58,21 +58,19 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) } #endif - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = dasum_kernel(n, x); - } + } else { n *= inc_x; - - while(i < n) { + while (i < n) { sumf += ABS_K(x[i]); i += inc_x; } @@ -80,3 +78,53 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) return(sumf); } +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; + FLOAT * dummy_b; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} + diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c new file mode 100644 index 000000000..66e9ff907 --- /dev/null +++ b/kernel/x86_64/drot.c @@ -0,0 +1,205 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "drot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "drot_microk_haswell-2.c" +#endif + +#ifndef HAVE_DROT_KERNEL +#include "../simd/intrin.h" + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; +#if V_SIMD_F64 && V_SIMD > 256 + const int vstep = v_nlanes_f64; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f64 __c = v_setall_f64(c); + v_f64 __s = v_setall_f64(s); + v_f64 vx0, vx1, vx2, vx3; + v_f64 vy0, vy1, vy2, vy3; + v_f64 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f64(x + i); + vx1 = v_loadu_f64(x + i + vstep); + vx2 = v_loadu_f64(x + i + vstep * 2); + vx3 = v_loadu_f64(x + i + vstep * 3); + vy0 = v_loadu_f64(y + i); + vy1 = v_loadu_f64(y + i + vstep); + vy2 = v_loadu_f64(y + i + vstep * 2); + vy3 = v_loadu_f64(y + i + vstep * 3); + + vt0 = v_mul_f64(__s, vy0); + vt1 = v_mul_f64(__s, vy1); + vt2 = v_mul_f64(__s, vy2); + vt3 = v_mul_f64(__s, vy3); + + vt0 = v_muladd_f64(__c, vx0, vt0); + vt1 = v_muladd_f64(__c, vx1, vt1); + vt2 = v_muladd_f64(__c, vx2, vt2); + vt3 = v_muladd_f64(__c, vx3, vt3); + + v_storeu_f64(x + i, vt0); + v_storeu_f64(x + i + vstep, vt1); + v_storeu_f64(x + i + vstep * 2, vt2); + 
v_storeu_f64(x + i + vstep * 3, vt3);
+
+        vt0 = v_mul_f64(__s, vx0);
+        vt1 = v_mul_f64(__s, vx1);
+        vt2 = v_mul_f64(__s, vx2);
+        vt3 = v_mul_f64(__s, vx3);
+
+        vt0 = v_mulsub_f64(__c, vy0, vt0);
+        vt1 = v_mulsub_f64(__c, vy1, vt1);
+        vt2 = v_mulsub_f64(__c, vy2, vt2);
+        vt3 = v_mulsub_f64(__c, vy3, vt3);
+
+        v_storeu_f64(y + i, vt0);
+        v_storeu_f64(y + i + vstep, vt1);
+        v_storeu_f64(y + i + vstep * 2, vt2);
+        v_storeu_f64(y + i + vstep * 3, vt3);
+    }
+
+    for (; i < unrollx; i += vstep) {
+        vx0 = v_loadu_f64(x + i);
+        vy0 = v_loadu_f64(y + i);
+
+        vt0 = v_mul_f64(__s, vy0);
+        vt0 = v_muladd_f64(__c, vx0, vt0);
+        v_storeu_f64(x + i, vt0);
+
+        vt0 = v_mul_f64(__s, vx0);
+        vt0 = v_mulsub_f64(__c, vy0, vt0);
+        v_storeu_f64(y + i, vt0);
+    }
+#else
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+
+    BLASLONG n1 = n & (~7);
+
+    while (i < n1) {
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+#endif
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0, iy = 0;
+
+    FLOAT temp;
+
+    if (n <= 0)
+        return;
+    if ((inc_x == 1) && (inc_y == 1)) {
+        drot_kernel(n, x, y, c, s);
+    }
+    else {
+        while (i < n) {
+            temp = c * x[ix] + s * y[iy];
+            y[iy] = c * y[iy] - s * x[ix];
+            x[ix] = temp;
+
+            ix += inc_x;
+            iy += inc_y;
+            i++;
+        }
+    }
+    return;
+}
+
+
+#if defined(SMP)
+static int rot_thread_function(blas_arg_t *args)
+{
+
+    rot_compute(args->m,
+                args->a, args->lda,
+                args->b, args->ldb,
+                ((FLOAT *)args->alpha)[0],
+                ((FLOAT *)args->alpha)[1]);
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT alpha[2] = {c, s};
+    FLOAT dummy_c;
+#endif
+
+#if defined(SMP)
+    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
+        nthreads = 1;
+    }
+    else {
+        nthreads = num_cpu_avail(1);
+    }
+
+    if (nthreads == 1) {
+        rot_compute(n, x, inc_x, y, inc_y, c, s);
+    }
+    else {
+#if defined(DOUBLE)
+        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+    }
+#else
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+#endif
+    return 0;
+}
diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c
new file mode 100644
index 000000000..72a87696e
--- /dev/null
+++ b/kernel/x86_64/drot_microk_haswell-2.c
@@ -0,0 +1,87 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void drot_kernel(BLASLONG n, FLOAT *x,
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+    BLASLONG tail_index_4 = n&(~3);
+    BLASLONG tail_index_16 = n&(~15);
+
+    __m256d c_256, s_256;
+    if (n >= 4) {
+        c_256 = _mm256_set1_pd(c);
+        s_256 = _mm256_set1_pd(s);
+    }
+
+    __m256d x0, x1, x2, x3;
+    __m256d y0, y1, y2, y3;
+    __m256d t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_16; i += 16) {
+        x0 = _mm256_loadu_pd(&x[i + 0]);
+        x1 = _mm256_loadu_pd(&x[i + 4]);
+        x2 = _mm256_loadu_pd(&x[i + 8]);
+        x3 = _mm256_loadu_pd(&x[i +12]);
+        y0 = _mm256_loadu_pd(&y[i + 0]);
+        y1 = _mm256_loadu_pd(&y[i + 4]);
+        y2 = _mm256_loadu_pd(&y[i + 8]);
+        y3 = _mm256_loadu_pd(&y[i +12]);
+
+        t0 = _mm256_mul_pd(s_256, y0);
+        t1 = _mm256_mul_pd(s_256, y1);
+        t2 = _mm256_mul_pd(s_256, y2);
+        t3 = _mm256_mul_pd(s_256, y3);
+
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        t1 = _mm256_fmadd_pd(c_256, x1, t1);
+        t2 = _mm256_fmadd_pd(c_256, x2, t2);
+        t3 = _mm256_fmadd_pd(c_256, x3, t3);
+
+        _mm256_storeu_pd(&x[i + 0], t0);
+        _mm256_storeu_pd(&x[i + 4], t1);
+        _mm256_storeu_pd(&x[i + 8], t2);
+        _mm256_storeu_pd(&x[i +12], t3);
+
+        t0 = _mm256_mul_pd(s_256, x0);
+        t1 = _mm256_mul_pd(s_256, x1);
+        t2 = _mm256_mul_pd(s_256, x2);
+        t3 = _mm256_mul_pd(s_256, x3);
+
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        t1 = _mm256_fmsub_pd(c_256, y1, t1);
+        t2 = _mm256_fmsub_pd(c_256, y2, t2);
+        t3 = _mm256_fmsub_pd(c_256, y3, t3);
+
+        _mm256_storeu_pd(&y[i + 0], t0);
+        _mm256_storeu_pd(&y[i + 4], t1);
+        _mm256_storeu_pd(&y[i + 8], t2);
+        _mm256_storeu_pd(&y[i +12], t3);
+
+    }
+
+    for (i = tail_index_16; i < tail_index_4; i += 4) {
+        x0 = _mm256_loadu_pd(&x[i]);
+        y0 = _mm256_loadu_pd(&y[i]);
+
+        t0 = _mm256_mul_pd(s_256, y0);
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        _mm256_storeu_pd(&x[i], t0);
+
+        t0 = _mm256_mul_pd(s_256, x0);
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        _mm256_storeu_pd(&y[i], t0);
+    }
+
+    for (i = tail_index_4; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c
new file mode 100644
index 000000000..4e862e663
--- /dev/null
+++ b/kernel/x86_64/drot_microk_skylakex-2.c
@@ -0,0 +1,94 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG n1 = n;
+
+    BLASLONG tail_index_8 = 0;
+    BLASLONG tail_index_32 = 0;
+
+    __m512d c_512 = _mm512_set1_pd(c);
+    __m512d s_512 = _mm512_set1_pd(s);
+
+    tail_index_8 = n1 & (~7);
+    tail_index_32 = n1 & (~31);
+
+
+    __m512d x0, x1, x2, x3;
+    __m512d y0, y1, y2, y3;
+    __m512d t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm512_loadu_pd(&x[i + 0]);
+        x1 = _mm512_loadu_pd(&x[i + 8]);
+        x2 = _mm512_loadu_pd(&x[i +16]);
+        x3 = _mm512_loadu_pd(&x[i +24]);
+        y0 = _mm512_loadu_pd(&y[i + 0]);
+        y1 = _mm512_loadu_pd(&y[i + 8]);
+        y2 = _mm512_loadu_pd(&y[i +16]);
+        y3 = _mm512_loadu_pd(&y[i +24]);
+
+        t0 = _mm512_mul_pd(s_512, y0);
+        t1 = _mm512_mul_pd(s_512, y1);
+        t2 = _mm512_mul_pd(s_512, y2);
+        t3 = _mm512_mul_pd(s_512, y3);
+
+        t0 = _mm512_fmadd_pd(c_512, x0, t0);
+        t1 = _mm512_fmadd_pd(c_512, x1, t1);
+        t2 = _mm512_fmadd_pd(c_512, x2, t2);
+        t3 = _mm512_fmadd_pd(c_512, x3, t3);
+
+        _mm512_storeu_pd(&x[i + 0], t0);
+        _mm512_storeu_pd(&x[i + 8], t1);
+        _mm512_storeu_pd(&x[i +16], t2);
+        _mm512_storeu_pd(&x[i 
+24], t3); + + t0 = _mm512_mul_pd(s_512, x0); + t1 = _mm512_mul_pd(s_512, x1); + t2 = _mm512_mul_pd(s_512, x2); + t3 = _mm512_mul_pd(s_512, x3); + + t0 = _mm512_fmsub_pd(c_512, y0, t0); + t1 = _mm512_fmsub_pd(c_512, y1, t1); + t2 = _mm512_fmsub_pd(c_512, y2, t2); + t3 = _mm512_fmsub_pd(c_512, y3, t3); + + _mm512_storeu_pd(&y[i + 0], t0); + _mm512_storeu_pd(&y[i + 8], t1); + _mm512_storeu_pd(&y[i +16], t2); + _mm512_storeu_pd(&y[i +24], t3); + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm512_loadu_pd(&x[i]); + y0 = _mm512_loadu_pd(&y[i]); + + t0 = _mm512_mul_pd(s_512, y0); + t0 = _mm512_fmadd_pd(c_512, x0, t0); + _mm512_storeu_pd(&x[i], t0); + + t0 = _mm512_mul_pd(s_512, x0); + t0 = _mm512_fmsub_pd(c_512, y0, t0); + _mm512_storeu_pd(&y[i], t0); + } + + if ((n1&7) > 0) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7))); + __m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]); + __m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]); + __m512d temp = _mm512_mul_pd(s_512, tail_y); + temp = _mm512_fmadd_pd(c_512, tail_x, temp); + _mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp); + temp = _mm512_mul_pd(s_512, tail_x); + temp = _mm512_fmsub_pd(c_512, tail_y, temp); + _mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp); + } +} +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 36ec4a737..d0cea9bee 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) #endif -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT * x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = sasum_kernel(n, x); } else { - n *= inc_x; while(i < n) { sumf += ABS_K(x[i]); i += inc_x; } - } + return (sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? 
num_cpu : n/100000; + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT * ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif return(sumf); } diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c new file mode 100644 index 000000000..18e64dc3f --- /dev/null +++ b/kernel/x86_64/sbgemv_n.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_n_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? 
(TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr
+
+#define ALIGN64_FREE(ptr) \
+    free(ptr)
+
+#ifndef HAVE_SBGEMV_N_ACCL_KERNEL
+static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    BLASLONG offset_lda, offset_m;
+    float accum = 0.0;
+    float tmp_x = 0.0;
+
+    bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float * a_fp32 = malloc(sizeof(float)*m*n);
+    float * x_fp32 = malloc(sizeof(float)*n);
+
+    for (BLASLONG j=0; j<n; j++) {
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
new file mode 100644
--- /dev/null
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_N_ACCL_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+
+// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios
+#undef ZERO_BETA
+#undef ONE_BETA
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios
+#undef ZERO_BETA
+#define ONE_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA as ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    if (beta == ZERO) {          // BETA == 0.0, no need to accumulate the original Y data
+        if (alpha == ONE) {      // ALPHA == 1.0, no need to multiply ALPHA
+            sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y);
+        } else {                 // ALPHA != 1.0, need to multiply ALPHA
+            sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+        }
+    } else {                     // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is
+        if (beta == ONE) {
+            sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+        } else {
+            sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+        }
+    }
+
+    return 0;
+}
+
+#endif
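The four repeated includes above are C's poor-man's template instantiation: each pass re-includes the same kernel body under a different ZERO_BETA/ONE_BETA/ONE_ALPHA configuration, and the template file selects the function name and the STORE*_RESULT macros from those switches. A toy, self-contained illustration of the same include-as-template pattern, assuming the snippet is saved as specialize.c (all names here are hypothetical, not OpenBLAS code):

/* specialize.c -- stamp out one function per macro configuration by
   re-including this very file; mirrors the ZERO_BETA trick above. */
#ifndef SPECIALIZE_DRIVER
#define SPECIALIZE_DRIVER

#include <stdio.h>

#define ZERO_BETA 1
#include "specialize.c"   /* pass 1: generates scale_zero_beta() */

#undef ZERO_BETA
#include "specialize.c"   /* pass 2: generates scale_general()   */

int main(void)
{
    float y = 2.0f;
    scale_zero_beta(&y, 3.0f, 0.0f);
    printf("%g\n", y);    /* 3:   y = alpha * 1           */
    scale_general(&y, 3.0f, 0.5f);
    printf("%g\n", y);    /* 4.5: y = alpha * 1 + beta*y  */
    return 0;
}

#else /* "template" half: compiled once per configuration of ZERO_BETA */

#ifdef ZERO_BETA
static void scale_zero_beta(float *y, float alpha, float beta)
{
    (void)beta;           /* beta known to be zero: y is never read */
    *y = alpha * 1.0f;
}
#else
static void scale_general(float *y, float alpha, float beta)
{
    *y = alpha * 1.0f + beta * *y;
}
#endif

#endif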
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
new file mode 100644
index 000000000..46e6d0ff9
--- /dev/null
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
@@ -0,0 +1,234 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+#ifndef ZERO_BETA // Beta is non-zero
+
+#ifndef ONE_BETA // BETA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#else // BETA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+
+#endif
+
+#else // BETA is zero
+
+#ifndef ONE_ALPHA // ALPHA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else // ALPHA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT
+
+#endif
+
+#endif
+
+
+
+// 32 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave)
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);
+    BLASLONG tag_m_128x = m & (~127);
+
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, 
accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + __m512i ZERO512 = _mm512_setzero_si512(); + + unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa); + __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value); + unsigned int blend_lo_mask_value = ((unsigned int)0x55555555); + __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value); + + __m512i M512_EPI32_8 = _mm512_set1_epi32(8); + __m512i idx_base_0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_8); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + matrixArray_2 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1); + matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1); + matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2); + matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2); + matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3); + matrixArray_7 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0) + BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0) + BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0) + BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0) + BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0) + BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3); + accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3); + accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5); + accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5); + accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7); + accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7); + + 
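+        // Lane i of accum512_0 holds the partial dot product for row idx_m+2*i (even rows, isolated by
+        // blend_lo_mask) and lane i of accum512_1 the one for row idx_m+2*i+1 (odd rows, blend_hi_mask);
+        // the same pairing holds for the other accumulator pairs. idx_base_0/idx_base_1 interleave each
+        // pair back into natural row order, 16 results per register, so the stores below can write
+        // y[idx_m..idx_m+127] contiguously.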
STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
+        STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32)
+        STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48)
+        STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64)
+        STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80)
+        STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96)
+        STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112)
+    }
+
+    for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_32x; idx_m+=32) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+
+        STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
+    }
+
+    if (tag_m_32x != m) {
+        unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
+        __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
+
+        unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15)));
+        __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value);
+
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+
+        if ((m-tag_m_32x) >= 16) {
+            STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0)
+            STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask)
+        } else {
+            STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask)
+        }
+    }
+
+    return 0;
+}
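The tail handling just above is the standard AVX-512 masked-remainder idiom: derive a k-mask with one bit per leftover element, then let maskz loads and masked stores keep the vector width fixed without touching memory past the end of the array. A self-contained sketch of the idiom (function name and values are illustrative only; build with -mavx512f):

#include <immintrin.h>
#include <stdio.h>

/* v[i] *= alpha for all n elements, 16 floats at a time, masked tail included */
static void scale_tail(float *v, long n, float alpha)
{
    long body = n & ~15L;                          /* full 16-wide chunks  */
    __m512 a = _mm512_set1_ps(alpha);
    for (long i = 0; i < body; i += 16)
        _mm512_storeu_ps(v + i, _mm512_mul_ps(a, _mm512_loadu_ps(v + i)));
    if (n & 15) {                                  /* 1..15 leftover lanes */
        __mmask16 tail = (__mmask16)((1u << (n & 15)) - 1);
        __m512 x = _mm512_maskz_loadu_ps(tail, v + body);
        _mm512_mask_storeu_ps(v + body, tail, _mm512_mul_ps(a, x));
    }
}

int main(void)
{
    float v[21];
    for (int i = 0; i < 21; i++) v[i] = 1.0f;
    scale_tail(v, 21, 2.0f);
    printf("%g %g\n", v[15], v[20]);               /* prints 2 2 */
    return 0;
}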
diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c
new file mode 100644
index 000000000..22b099116
--- /dev/null
+++ b/kernel/x86_64/sbgemv_t.c
@@ -0,0 +1,142 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined (COOPERLAKE)
+#include "sbgemv_t_microk_cooperlake.c"
+#endif
+
+#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \
+    ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \
+    ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr
+
+#define ALIGN64_FREE(ptr) \
+    free(ptr)
+
+#ifndef HAVE_SBGEMV_T_ACCL_KERNEL
+static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    BLASLONG offset_lda, offset_n;
+    float accum = 0.0;
+
+    bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float * a_fp32 = malloc(sizeof(float)*m*n);
+    float * x_fp32 = malloc(sizeof(float)*n);
+
+    for (BLASLONG i=0; i<m; i++) {
diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
new file mode 100644
--- /dev/null
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_T_ACCL_KERNEL 1
+
+// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios
+#undef ZERO_BETA
+#undef ONE_BETA
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios
+#undef ZERO_BETA
+#define ONE_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA not ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// Define micro kernels for ALPHA as ONE && BETA ineffective (BETA == 0) scenarios
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_t_microk_cooperlake_template.c"
+
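Before the dispatch function that follows, it is worth pinning down the semantics all of its variants share. A scalar reference of the transposed bf16 GEMV these kernels accelerate (a sketch, not the library's code; the helper simply widens the 16-bit pattern into the high half of an fp32, which is exactly what bfloat16 is):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t bf16;                        /* raw bfloat16 bit pattern */

static float bf16_to_fp32(bf16 v)             /* bf16 == high 16 bits of fp32 */
{
    union { uint32_t u; float f; } t;
    t.u = (uint32_t)v << 16;
    return t.f;
}

/* y[i] = alpha * sum_j a[i*lda + j] * x[j]  (+ beta * y[i] when beta != 0) */
static void sbgemv_t_ref(long m, long n, float alpha, const bf16 *a, long lda,
                         const bf16 *x, float beta, float *y)
{
    for (long i = 0; i < m; i++) {
        float acc = 0.0f;
        for (long j = 0; j < n; j++)
            acc += bf16_to_fp32(a[i * lda + j]) * bf16_to_fp32(x[j]);
        y[i] = (beta == 0.0f) ? alpha * acc : alpha * acc + beta * y[i];
    }
}

int main(void)
{
    bf16 one = 0x3f80;                        /* 1.0f in bfloat16 */
    bf16 a[4] = {one, one, one, one}, x[2] = {one, one};
    float y[2] = {0, 0};
    sbgemv_t_ref(2, 2, 1.0f, a, 2, x, 0.0f, y);
    printf("%g %g\n", y[0], y[1]);            /* prints 2 2 */
    return 0;
}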
+static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    if (beta == ZERO) {          // BETA == 0.0, no need to accumulate the original Y data
+        if (alpha == ONE) {      // ALPHA == 1.0, no need to multiply ALPHA
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1 (m, alpha, a, x, y);  break;
+                            case 2:  sbgemv_kernel_32x2 (m, alpha, a, x, y);  break;
+                            case 3:  sbgemv_kernel_32x3 (m, alpha, a, x, y);  break;
+                            case 4:  sbgemv_kernel_16x4 (m, alpha, a, x, y);  break;
+                            case 5:  sbgemv_kernel_30x5 (m, alpha, a, x, y);  break;
+                            case 6:  sbgemv_kernel_16x6 (m, alpha, a, x, y);  break;
+                            case 7:  sbgemv_kernel_16x7 (m, alpha, a, x, y);  break;
+                            case 8:  sbgemv_kernel_16x8 (m, alpha, a, x, y);  break;
+                            case 9:  sbgemv_kernel_14x9 (m, alpha, a, x, y);  break;
+                            case 10: sbgemv_kernel_12x10(m, alpha, a, x, y);  break;
+                            case 11: sbgemv_kernel_15x11(m, alpha, a, x, y);  break;
+                            case 12: sbgemv_kernel_15x12(m, alpha, a, x, y);  break;
+                            case 13: sbgemv_kernel_16x13(m, alpha, a, x, y);  break;
+                            case 14: sbgemv_kernel_16x14(m, alpha, a, x, y);  break;
+                            case 15: sbgemv_kernel_16x15(m, alpha, a, x, y);  break;
+                            case 16: sbgemv_kernel_16x16(m, alpha, a, x, y);  break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y);
+                    }
+                }
+            }
+        } else {                 // ALPHA != 1.0, need to multiply ALPHA
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1_alpha (m, alpha, a, x, y);  break;
+                            case 2:  sbgemv_kernel_32x2_alpha (m, alpha, a, x, y);  break;
+                            case 3:  sbgemv_kernel_32x3_alpha (m, alpha, a, x, y);  break;
+                            case 4:  sbgemv_kernel_16x4_alpha (m, alpha, a, x, y);  break;
+                            case 5:  sbgemv_kernel_30x5_alpha (m, alpha, a, x, y);  break;
+                            case 6:  sbgemv_kernel_16x6_alpha (m, alpha, a, x, y);  break;
+                            case 7:  sbgemv_kernel_16x7_alpha (m, alpha, a, x, y);  break;
+                            case 8:  sbgemv_kernel_16x8_alpha (m, alpha, a, x, y);  break;
+                            case 9:  sbgemv_kernel_14x9_alpha (m, alpha, a, x, y);  break;
+                            case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y);  break;
+                            case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y);  break;
+                            case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y);  break;
+                            case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y);  break;
+                            case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y);  break;
+                            case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y);  break;
+                            case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y);  break;
+                            default: break;
+                        }
+                    } else {
+                        sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y);
+                    }
+                }
+            }
+        }
+    } else {                     // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is
+        if (beta == ONE) {
+            if (n > 127) {
+                sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+            } else if (n > 32) {
+                sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+            } else {
+                if (n > 16) {
+                    sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
+                } else {
+                    if (lda == n) {
+                        switch(n) {
+                            case 1:  sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 2:  sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 3:  sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 4:  sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 5:  sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 6:  sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 7:  sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 8:  sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 9:  sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y);  break;
+                            case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 11: sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y);  break;
+                            case 16: 
sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } + } + } + } else { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break; + case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, y); break; + case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + } + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c new file mode 100644 index 000000000..51e681add --- /dev/null +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -0,0 +1,3082 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+#ifndef ZERO_BETA // Beta is non-zero
+
+#ifndef ONE_BETA // BETA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#else // BETA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+
+#endif
+
+#else // BETA is zero
+
+#ifndef ONE_ALPHA // ALPHA is not ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else // ALPHA is ONE
+
+#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT
+
+#endif
+
+#endif
+
+
+// 32 rows parallel processing BF16 GEMV kernel for n=1 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x1_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x1_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x1_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x1(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+    __m512i xArray;
+    __m512 result_0, result_1;
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+#endif
+
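+    // dpbf16 multiplies bf16 lanes pairwise and accumulates each pair into one fp32 lane. With n=1 a
+    // row contributes a single element, so the permutes below (load_idx_lo/load_idx_hi) spread the 32
+    // loaded row elements into the even bf16 lanes, while xArray is blended so that x[0] sits in the
+    // even lanes and zero in the odd lanes; each fp32 accumulator lane then receives exactly
+    // row_i * x[0], and whatever lands in the odd matrix lanes is a don't-care because its x
+    // counterpart is zero.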
__m512i load_idx_lo = _mm512_set_epi16(0, 15, 0, 14, 0, 13, 0, 12, 0, 11, 0, 10, 0, 9, 0, 8,\ + 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0); + __m512i M512_EPI16_16 = _mm512_set1_epi16(16); + __m512i load_idx_hi = _mm512_add_epi16(load_idx_lo, M512_EPI16_16); + + unsigned int interleve_mask_value = ((unsigned int) 0x55555555); + __mmask32 interleave_mask = *((__mmask32*) &interleve_mask_value); + + xArray = _mm512_set1_epi16((short) x[0]); + xArray = _mm512_mask_blend_epi16(interleave_mask, _mm512_setzero_si512(), xArray); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-tail_num)); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> (32-tail_num)); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + STORE16_MASK_COMPLETE_RESULT(result_1, y+tag_m_32x+16, store_mask) + } else if (tail_num > 8) { + __m256 result256_0 = _mm256_setzero_ps(); + __m256 result256_1 = _mm256_setzero_ps(); + + __m256i load_idx_lo256 = _mm512_castsi512_si256(load_idx_lo); + __m256i load_idx_hi256 = _mm512_extracti32x8_epi32(load_idx_lo, 0x1); + __m256i xArray256 = _mm512_castsi512_si256(xArray); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + __m256i matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 16 rows with n=1 + __m256i matrixArray256_1 = _mm256_permutexvar_epi16(load_idx_lo256, matrixArray256_0); // Expand the low 8 elements + __m256i matrixArray256_2 = _mm256_permutexvar_epi16(load_idx_hi256, matrixArray256_0); // Expand the high 8 elements + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_1, (__m256bh) xArray256); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_2, (__m256bh) xArray256); + + unsigned char store_mask_value = (((unsigned char)0xff) >> (16-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_MASK_COMPLETE_RESULT(result256_1, y+tag_m_32x+8, store_mask) + } else { + __m128 result128_0 = _mm_setzero_ps(); 
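+            // Tail of at most 8 rows: 0xff >> (8 - tail_num) keeps one low bit per remaining row, so
+            // the maskz load below reads only valid elements and zero-fills the rest; the 4-wide store
+            // masks at the end of this branch are built with the same shift pattern.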
+ __m128 result128_1 = _mm_setzero_ps(); + + __m128i load_idx_lo128 = _mm_set_epi16(0, 3, 0, 2, 0, 1, 0, 0); + __m128i M128_EPI16_4 = _mm_set1_epi16(4); + __m128i load_idx_hi128 = _mm_add_epi16(load_idx_lo128, M128_EPI16_4); + + __m128i xArray128 = _mm512_castsi512_si128(xArray); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m128i matrixArray128_0 = _mm_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 8 rows with n=1 + __m128i matrixArray128_1 = _mm_permutexvar_epi16(load_idx_lo128, matrixArray128_0); // Expand the low 4 elements + __m128i matrixArray128_2 = _mm_permutexvar_epi16(load_idx_hi128, matrixArray128_0); // Expand the high 4 elements + + result128_0 = _mm_dpbf16_ps(result128_0, (__m128bh) matrixArray128_1, (__m128bh) xArray128); + result128_1 = _mm_dpbf16_ps(result128_1, (__m128bh) matrixArray128_2, (__m128bh) xArray128); + + if (tail_num > 4) { + unsigned char store_mask_value = (((unsigned char)0xf) >> (8-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_COMPLETE_RESULT(result128_0, y+tag_m_32x) + STORE4_MASK_COMPLETE_RESULT(result128_1, y+tag_m_32x+4, store_mask) + } else { + unsigned char store_mask_value = (((unsigned char)0xf) >> (4-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_MASK_COMPLETE_RESULT(result128_0, y+tag_m_32x, store_mask) + } + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=2 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x2_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x2_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x2_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1; + __m512i xArray; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + xArray = _mm512_broadcastd_epi32(_mm_maskz_loadu_epi16(load_mask, x)); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*2]); // Load 16 rows as n=2 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+16)*2]); // Load 16 rows as n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (m - tag_m_32x >= 16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + + tag_m_32x += 16; + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 8) { + result_0 = _mm512_setzero_ps(); + + unsigned short 
tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_32x, tail_mask) + } else if (tail_num == 8) { + __m256 result256 = _mm256_setzero_ps(); + + __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_32x) + } else { + __m256 result256 = _mm256_setzero_ps(); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m256i matrixArray256 = _mm256_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_32x, tail_mask) + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=3 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x3_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x3_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x3_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|0|0|0|0|0| + __m512i xArray_0 = _mm512_broadcastd_epi32(xTmp); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_1 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // x2| 0|x2| 0|...|x2| 0| + + __m512i load_idx_base; + __m512i M512_EPI16_2, M512_EPI16_8, M512_EPI16_16; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + M512_EPI16_16 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + load_idx_base = _mm512_set_epi16(46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 0); + + if (tag_m_32x > 0) { + __m512i load_idx01_1st, load_idx01_2nd, load_idx2_1st, load_idx2_2nd; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6; + + unsigned int idx_blend_mask_value = ((unsigned int)0x80000000); + __mmask32 idx_blend_mask = *((__mmask32*) &idx_blend_mask_value); + + load_idx01_1st = load_idx_base; + load_idx01_2nd = _mm512_add_epi16(load_idx01_1st, M512_EPI16_16); + load_idx2_1st = _mm512_add_epi16(load_idx01_1st, M512_EPI16_2); + load_idx2_2nd = _mm512_add_epi16(load_idx01_2nd, M512_EPI16_2); + load_idx2_2nd = 
_mm512_mask_blend_epi16(idx_blend_mask, load_idx2_2nd, _mm512_setzero_si512()); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*3]); // Load 10 rows with n=3 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+10)*3 + 2)]); // Load 10 rows with n=3 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+21)*3 + 1)]); // Load 10 rows with n=3 plus 2 element + + matrixArray_3 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_1st, matrixArray_1); // Select the first 2 elements for each row + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_1, load_idx01_2nd, matrixArray_2); // Select the first 2 elements for each row + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_0, load_idx2_1st, matrixArray_1); // Select the third element for each row + matrixArray_6 = _mm512_permutex2var_epi16(matrixArray_1, load_idx2_2nd, matrixArray_2); // Select the third element for each row + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_3, (__m512bh) xArray_0); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_5, (__m512bh) xArray_1); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_4, (__m512bh) xArray_0); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_6, (__m512bh) xArray_1); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (tag_m_32x != m) { + __m256i load256_idx01_1st, load256_idx01_2nd, load256_idx2_1st, load256_idx2_2nd; + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6; + __m256 result256_0, result256_1; + + unsigned short idx256_blend_mask_value = ((unsigned short)0x8000); + __mmask16 idx256_blend_mask = *((__mmask16*) &idx256_blend_mask_value); + + load256_idx01_1st = _mm512_castsi512_si256(load_idx_base); + load256_idx01_2nd = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_8)); + load256_idx2_1st = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_add_epi16(load256_idx01_2nd, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_mask_blend_epi16(idx256_blend_mask, load256_idx2_2nd, _mm256_setzero_si256()); + + if (m - tag_m_32x > 15) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, 
(__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_COMPLETE_RESULT(result256_1, y+tag_m_32x+8) + + tag_m_32x += 16; + } + + if (tag_m_32x != m) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + BLASLONG tail_num = m-tag_m_32x; + + if (tail_num > 10) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else if (tail_num > 5) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows + matrixArray256_2 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = 
_mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num*3))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)*3]); // Load m-tag_m_32x rows + matrixArray256_1 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } + + unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num))); + __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value); + __m512 result512 = _mm512_insertf32x8(_mm512_castps256_ps512(result256_0), result256_1, 0x1); + STORE16_MASK_COMPLETE_RESULT(result512, y+tag_m_32x, store_tail_mask) + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=4 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x4_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x4_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x4_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_01, xArray_23, xArray_remix; + __m512 result; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + __m512i idx_base_remix = _mm512_inserti32x8(idx_base_0, _mm512_castsi512_si256(idx_base_1), 0x1); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi32(x_load_mask, x); // |x0|x1|x2|x3|0|0|0|0| + xArray_01 = _mm512_broadcastd_epi32(xTmp); // |x0|x1|x0|x1|...|x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // |x2|x3|x2|x3|...|x2|x3| + unsigned short blend_mask_value = ((unsigned short)0xff00); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + xArray_remix = _mm512_mask_blend_epi32(blend_mask, xArray_01, xArray_23); // |x0|x1|x0|x1|x0|x1|x0|x1|...|x2|x3|x2|x3|x2|x3|x2|x3| + + if (tag_m_16x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*4]); // Load 8 rows with n=4 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+8)*4]); // Load 8 rows with n=4 + + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_0, matrixArray_1); // 
|a0|a1|...|h0|h1|i0|i1|...|p0|p1|
+            matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_1, matrixArray_1); // |a2|a3|...|h2|h3|i2|i3|...|p2|p3|
+
+            result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_01);
+            result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_3, (__m512bh) xArray_23);
+
+            STORE16_COMPLETE_RESULT(result, y+idx_m)
+        }
+    }
+
+    if (m - tag_m_16x > 7) {
+        result = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*4]); // Load 8 rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));
+
+        STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+        tag_m_16x += 8;
+    }
+
+    BLASLONG tail_num = m-tag_m_16x;
+    if (tail_num != 0) {
+        result = _mm512_setzero_ps();
+
+        unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num*2));
+        __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+        matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_16x)*4]); // Load the remaining tail_num rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));
+
+        unsigned char store_tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));
+        __mmask8 store_tail_mask = *((__mmask8*) &store_tail_mask_value);
+        STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, store_tail_mask)
+    }
+
+    return 0;
+}
+
+// 30 rows parallel processing BF16 GEMV kernel for n=5 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_30x5_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_30x5_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_30x5_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_30x = m - (m%30);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 3);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|0|0|0|
+
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+    __m512 result_0, result_1;
+    __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1|
+    __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3|
+    __m512i xArray_4 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4| 0|x4| 0|...|x4| 0|
+
+    __m512i M512_EPI16_2 = _mm512_set1_epi16(2);
+    __m512i load_idx01_stage1_1st = _mm512_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0, 58, 57, 53, 52, 48, 47, 43, 42,
+                                                     38, 37, 33, 32, 26, 25, 21, 20, 16, 15, 11, 10, 6, 5, 1, 0);
+    __m512i load_idx01_stage1_2nd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x39);
+    __m512i load_idx01_stage1_3rd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x4f);
+
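+    // NOTE: load_idx01_stage1_* gather the 16-bit lanes holding elements 0|1 of each
+    // 5-element row out of two source registers; adding 2 below shifts the same
+    // pattern to elements 2|3, and adding 2 again selects element 4, so each
+    // _mm512_dpbf16_ps step can consume one element pair per row.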
+    __m512i load_idx23_stage1_1st = _mm512_add_epi16(load_idx01_stage1_1st, M512_EPI16_2);
+    __m512i load_idx23_stage1_2nd = _mm512_add_epi16(load_idx01_stage1_2nd, M512_EPI16_2);
+    __m512i load_idx23_stage1_3rd = _mm512_add_epi16(load_idx01_stage1_3rd, M512_EPI16_2);
+
+    __m512i load_idx4_stage1_1st = _mm512_add_epi16(load_idx23_stage1_1st, M512_EPI16_2);
+    __m512i load_idx4_stage1_2nd = _mm512_add_epi16(load_idx23_stage1_2nd, M512_EPI16_2);
+    __m512i load_idx4_stage1_3rd = _mm512_add_epi16(load_idx23_stage1_3rd, M512_EPI16_2);
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4;
+    __m512i matrixArray_stage1_0, matrixArray_stage1_1, matrixArray_stage1_2;
+    __m512i matrixArray_stage2_0, matrixArray_stage2_1;
+
+    unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);
+    __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+    unsigned short store_mask_value = (((unsigned short)0xffff) >> 2);
+    __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+    if (tag_m_30x > 0) {
+        unsigned short blend_mask_value_0 = ((unsigned short)0xf000);
+        __mmask16 blend_mask_0 = *((__mmask16*) &blend_mask_value_0);
+        unsigned short blend_mask_value_1 = ((unsigned short)0x3f00);
+        __mmask16 blend_mask_1 = *((__mmask16*) &blend_mask_value_1);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_30x; idx_m+=30) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5
+            matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+12)*5)]); // Load 6 rows with n=5
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+18)*5)]); // Load 6 rows with n=5
+            matrixArray_4 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+24)*5)]); // Load 6 rows with n=5
+
+            // Process the 0|1 elements
+            // Stage 1: Select the 0|1 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx01_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx01_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 0|1 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 0|1 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_01);
+
+            // Process the 2|3 elements
+            // Stage 1: Select the 2|3 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx23_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx23_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 2|3 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 2|3 elements and accumulate the result of 0|1 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_23);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_23);
+
+            // Process the 4 elements
+            // Stage 1: Select the 4 elements for each row
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx4_stage1_2nd, matrixArray_3);
+            matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx4_stage1_3rd, matrixArray_4);
+            // Stage 2: Reorder and compress all the 4 elements
+            matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+            matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+            // Calculate the result of the 4 elements and accumulate the result of 0|1 and 2|3 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_4);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_4);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+            STORE16_MASK_COMPLETE_RESULT(result_1, y+idx_m+16, store_mask)
+        }
+    }
+
+    if (m - tag_m_30x > 11) {
+        BLASLONG tag_m_12x = m - ((m-tag_m_30x)%12);
+        for (BLASLONG idx_m = tag_m_30x; idx_m < tag_m_12x; idx_m+=12) {
+            unsigned short store_less_mask_value = (((unsigned short)0xffff) >> 4);
+            __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5
+
+            // Interleave the elements
+            matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+            matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+            matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+            // Calculate and accumulate the result
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_less_mask)
+            tag_m_30x += 12;
+        }
+    }
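+    // NOTE: rows are consumed in blocks of 30 above, then in blocks of 12 with a
+    // masked store; the final tail below is handled either with masked 512-bit
+    // loads (more than 6 rows left) or row by row in a scalar loop.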
+
+    BLASLONG tail_num = m - tag_m_30x;
+    if (tail_num > 6) {
+        unsigned short store_less_mask_value = (((unsigned short)0xffff) >> (4+(12-tail_num)));
+        __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+        unsigned int load_less_mask_value = (((unsigned int)0xffffffff) >> (2+(12-tail_num)*5));
+        __mmask32 load_less_mask = *((__mmask32*) &load_less_mask_value);
+        result_0 = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_30x)*5]); // Load 6 rows with n=5
+        matrixArray_1 = _mm512_maskz_loadu_epi16(load_less_mask, &a[((tag_m_30x+6)*5)]); // Load the remaining tail_num-6 rows with n=5
+
+        // Interleave the elements
+        matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+        matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+        matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+        // Calculate and accumulate the result
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+        STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_30x, store_less_mask)
+    } else {
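+        // NOTE: the scalar fallback computes one row per iteration: a masked
+        // 128-bit load feeds _mm_dpbf16_ps, leaving four float partial sums;
+        // _mm_shuffle_ps with immediate 14 (0b1110) folds the upper pair onto the
+        // lower pair, and a second shuffle with immediate 1 brings the remaining
+        // partial sum into lane 0 before the alpha/beta epilogue.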
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_30x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*5]); // Load 1 row with n=5
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=6 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x6_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x6_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x6_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 2);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|x5|0|0|
+
+    if (tag_m_16x > 0) {
+        __m512 result_0;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_1 = _mm512_set1_epi32(1);
+        __m512i load_idx01_1st = _mm512_set_epi32( 0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
+        __m512i load_idx01_2nd = _mm512_set_epi32(13, 10, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+        __m512i load_idx23_1st = _mm512_add_epi32(load_idx01_1st, M512_EPI32_1);
+        __m512i load_idx23_2nd = _mm512_add_epi32(load_idx01_2nd, M512_EPI32_1);
+
+        __m512i load_idx45_1st = _mm512_add_epi32(load_idx23_1st, M512_EPI32_1);
+        __m512i load_idx45_2nd = _mm512_add_epi32(load_idx23_2nd, M512_EPI32_1);
+
+        unsigned short blend_mask_value = ((unsigned short)0x0400);
+        __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+        // Set the 11th element to be 0 as invalid index for a 512 bit epi32 register
+        load_idx45_1st = _mm512_mask_blend_epi32(blend_mask, load_idx45_1st, load_idx01_2nd);
+        // Set the 11th element to be 0 as 0 is the correct index
+        load_idx45_2nd = _mm512_mask_blend_epi32(blend_mask, load_idx45_2nd, load_idx01_2nd);
+
+        __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1|
+        __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3|
+        __m512i xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4|x5|x4|x5|...|x4|x5|
+
+        unsigned short permute_mask01_uint = (((unsigned short)0xf800));
+        __mmask16 permute_mask01 = *((__mmask16*) &permute_mask01_uint);
+        unsigned short permute_mask45_uint = (((unsigned short)0xfc00));
+        __mmask16 permute_mask45 = *((__mmask16*) &permute_mask45_uint);
+
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2;
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*6]); // Load 5 rows with n=6 plus 2 elements
+            matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+5)*6 + 2)]); // Load 5 rows with n=6 plus 2 elements
+            matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+10)*6 + 4)]); // Load 5 rows with n=6 plus 2 elements
+
+            // Stage 1: interleave for the a..k elements
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1);
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1);
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1);
+
+            // Stage 2: interleave for the l..p elements and remix together
+            matrixArray_stage_0 = _mm512_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2);
+            matrixArray_stage_1 = _mm512_mask_permutexvar_epi32(matrixArray_stage_1, permute_mask01, load_idx23_2nd, matrixArray_2);
+            matrixArray_stage_2 = _mm512_mask_permutexvar_epi32(matrixArray_stage_2, permute_mask45, load_idx45_2nd, matrixArray_2);
+
+            // Accumulate the results of the 0|1, 2|3 and 4|5 elements
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_01);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_45);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        }
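+        // NOTE: the 8-row tail below repeats the same gather/blend scheme with
+        // 256-bit registers: three ymm loads cover 8 rows x 6 elements = 48 bf16,
+        // and the same invalid-index blend trick keeps the permute tables in range.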
+
+        if (m - tag_m_16x > 7) {
+            __m256i M256_EPI32_1 = _mm512_castsi512_si256(M512_EPI32_1);
+            __m256i load_idx01_1st = _mm256_set_epi32( 0, 0, 15, 12, 9, 6, 3, 0);
+            __m256i load_idx01_2nd = _mm256_set_epi32( 5, 2, 0, 0, 0, 0, 0, 0);
+
+            __m256i load_idx23_1st = _mm256_add_epi32(load_idx01_1st, M256_EPI32_1);
+            __m256i load_idx23_2nd = _mm256_add_epi32(load_idx01_2nd, M256_EPI32_1);
+            unsigned char blend_mask_value = ((unsigned char)0x20);
+            __mmask8 blend_mask = *((__mmask8*) &blend_mask_value);
+            // Set the 6th element to be 0 as invalid index for a 512 bit epi32 register
+            load_idx23_1st = _mm256_mask_blend_epi32(blend_mask, load_idx23_1st, load_idx01_2nd);
+            // Set the 6th element to be 0 as 0 is the correct index
+            load_idx23_2nd = _mm256_mask_blend_epi32(blend_mask, load_idx23_2nd, load_idx01_2nd);
+
+            __m256i load_idx45_1st = _mm256_add_epi32(load_idx23_1st, M256_EPI32_1);
+            __m256i load_idx45_2nd = _mm256_add_epi32(load_idx23_2nd, M256_EPI32_1);
+
+            unsigned char permute_mask01_uint = (((unsigned char)0xc0));
+            __mmask8 permute_mask01 = *((__mmask8*) &permute_mask01_uint);
+            unsigned char permute_mask45_uint = (((unsigned char)0xe0));
+            __mmask8 permute_mask45 = *((__mmask8*) &permute_mask45_uint);
+
+            __m256i matrixArray_0, matrixArray_1, matrixArray_2;
+            __m256i matrixArray_stage_0;
+            __m256 result256_0;
+
+            result256_0 = _mm256_setzero_ps();
+
+            matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 elements
+            matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 elements
+            matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 elements
+
+            // Process the 0|1 elements
+            // Select the 0|1 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2);
+            // Calculate the result of the 0|1 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_01));
+
+            // Process the 2|3 elements
+            // Select the 2|3 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx23_2nd, matrixArray_2);
+            // Calculate the result of the 2|3 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_23));
+
+            // Process the 4|5 elements
+            // Select the 4|5 elements for each row
+            matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1);
+            matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx45_2nd, matrixArray_2);
+            // Calculate the result of the 4|5 elements
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_45));
+
+            STORE8_COMPLETE_RESULT(result256_0, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*6]); // Load 1 row with n=6
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=7 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x7_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x7_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x7_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 1);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|0|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512 result_0, result_1, result_2, result_3;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_2 =
_mm512_set1_epi32(2); + __m512i load_idx_stage1_0 = _mm512_set_epi16(31, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14, + 31, 13, 12, 11, 10, 9, 8, 7, 31, 6, 5, 4, 3, 2, 1, 0); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+4)*7]); // Load 4 rows with n=7 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+8)*7]); // Load 4 rows with n=7 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+12)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_2); // |i0|i1|i2|i3|...|j6|j7|k0|k1|k2|k3|...|l6|l7| + matrixArray_3 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_3); // |m0|m1|m2|m3|...|n6|n7|o0|o1|o2|o3|...|p6|p7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 3: interleave per 256 bits + result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = 
_mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7
+            unsigned int tail_load_mask_value = (((unsigned int)0xffffffff) >> (4+(8-tail_num)*7));
+            __mmask32 tail_load_mask = *((__mmask32*) &tail_load_mask_value);
+            matrixArray_1 = _mm512_maskz_loadu_epi16(tail_load_mask, &a[(tag_m_16x+4)*7]); // Load the remaining tail_num-4 rows with n=7
+
+            // Stage 1: padding
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));
+            __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*7]); // Load 1 row with n=7
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=8 && lda ineffective scenario
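+// NOTE: as with the other kernels in this file, the #ifdef ladder below compiles
+// this routine in four flavors (alpha_beta, alpha_one, alpha and the plain one);
+// ALPHAVECTOR/BETAVECTOR are declared under the same macros and are presumably
+// consumed by the STORE*_COMPLETE_RESULT macros defined earlier in the file.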
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x8_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x8_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x8_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512 result_0, result_1, result_2, result_3;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_2 = _mm512_set1_epi32(2);
+        __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
+        __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2);
+
+        unsigned short x_blend_mask_value = ((unsigned short)0xff00);
+        __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value);
+        xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)));
+        xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3)));
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*8]); // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+4)*8]); // Load 4 rows with n=8
+            matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+8)*8]); // Load 4 rows with n=8
+            matrixArray_3 = _mm512_loadu_si512(&a[(idx_m+12)*8]); // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567);
+
+            // Stage 2: interleave per 256 bits
+            result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44);
+            result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee);
+
+            result_2 = _mm512_add_ps(result_2, result_3);
+
+            STORE16_COMPLETE_RESULT(result_2, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+4)*8]); // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8
+            unsigned short tail_load_mask_value = (((unsigned int)0xffff) >> ((8-tail_num)*4));
+            __mmask16 tail_load_mask = *((__mmask16*) &tail_load_mask_value);
+            matrixArray_1 = _mm512_maskz_loadu_epi32(tail_load_mask, &a[(tag_m_16x+4)*8]); // Load the remaining tail_num-4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num));
+            __mmask8 tail_mask = *((__mmask8*) &tail_mask_value);
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 row with n=8
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 14 rows parallel processing BF16 GEMV kernel for n=9 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_14x9_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_14x9_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_14x9_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_14x = m - (m%14);
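+    // NOTE: with n=9, two 512-bit loads cover exactly 7 rows (63 bf16 elements =
+    // 32 loaded unmasked + 31 under load_mask), so one iteration of the main loop
+    // below consumes 14 rows.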
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
+    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0|
+
+    if (tag_m_14x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89;
+        __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m256i M256_EPI16_2 = _mm256_set1_epi16(2);
+        __m256i idx_base_0 = _mm256_set_epi16( 0, 0, 55, 54, 46, 45, 37, 36, 28, 27, 19, 18, 10, 9, 1, 0);
+        __m256i idx_base_1 = _mm256_add_epi16(idx_base_0, M256_EPI16_2);
+        __m256i idx_base_2 = _mm256_add_epi16(idx_base_1, M256_EPI16_2);
+        __m256i idx_base_3 = _mm256_add_epi16(idx_base_2, M256_EPI16_2);
+        __m256i idx_base_4 = _mm256_add_epi16(idx_base_3, M256_EPI16_2);
+        __m512i idx_idx = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
+
+        __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1));
+        __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3));
+        __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0));
+        __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2));
+        __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4));
+        __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 7);
+
+        xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1|
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3|
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5|
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7|
+        xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|0 |x8| 0| ...
|x8| 0| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 1); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short blend_mask_value = ((unsigned short)0x3f80); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_14x; idx_m+=14) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+3)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+7)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|g0|g1|a2|a3|...|g2|g3|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|g4|g5|a6|a7|...|g6|g7|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |h2|h3|...|n2|n3|h0|h1|...|n0|n1|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |h6|h7|...|n6|n7|h4|h5|...|n4|n5|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8| x|...|g8| x| x| x|...| x| x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|h8| x|...|n8| x|x|x|x|x| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|b0|b1|...|h0|h1|i0|i1|j0|j1|...|n0|n1|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|b2|b3|...|h2|h3|i2|i3|j2|j3|...|n2|n3|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|b4|b5|...|h4|h5|i4|i5|j4|j5|...|n4|n5|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|b6|b7|...|h6|h7|i6|i7|j6|j7|...|n6|n7|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_4, matrixArray_5); // |a8| x|b8| x|...|h8| x|i8| x|j8| x|...|n8| x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_14x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 7); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG 
i = tag_m_14x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*9]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 12 rows parallel processing BF16 GEMV kernel for n=10 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_12x10_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_12x10_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_12x10_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_12x = m - (m%12); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| + + if (tag_m_12x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_base_0 = _mm256_set_epi32( 0, 0, 26, 21, 16, 10, 5, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_1); + __m256i idx_base_2 = _mm256_add_epi32(idx_base_1, M256_EPI32_1); + __m256i idx_base_3 = _mm256_add_epi32(idx_base_2, M256_EPI32_1); + __m256i idx_base_4 = _mm256_add_epi32(idx_base_3, M256_EPI32_1); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 11, 10, 9, 8, 7, 6); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... 
|x0|x1|
+        xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3|
+        xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5|
+        xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7|
+        xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|x9|x8|x9| ... |x8|x9|
+
+        unsigned short blend_mask_value = ((unsigned short)0x0fc0);
+        __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+        unsigned short load_mask_value = (((unsigned short)0xffff) >> 1);
+        __mmask16 load_mask = *((__mmask16*) &load_mask_value);
+        unsigned short store_mask_value = (((unsigned short)0xffff) >> 4);
+        __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_12x; idx_m+=12) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m)*10]); // Load 3 rows with n=10
+            matrixArray_1 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+3)*10]); // Load 3 rows with n=10
+            matrixArray_2 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+6)*10]); // Load 3 rows with n=10
+            matrixArray_3 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+9)*10]); // Load 3 rows with n=10
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|f0|f1|a2|a3|...|f2|f3|x|x|x|x|x|x|x|x|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|f4|f5|a6|a7|...|f6|f7|x|x|x|x|x|x|x|x|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |g2|g3|...|l2|l3|g0|g1|...|l0|l1|x|x|x|x|x|x|x|x|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |g6|g7|...|l6|l7|g4|g5|...|l4|l5|x|x|x|x|x|x|x|x|
+            matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8|a9|...|f8|f9| x| x|...| x| x|x|x|x|x|x|x|x|x|
+            matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|g8|g9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+            // Stage 2: interleave per 256 bits
+            matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|l0|l1|x|x|x|x|x|x|x|x|
+            matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|...|l2|l3|x|x|x|x|x|x|x|x|
+            matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|...|l4|l5|x|x|x|x|x|x|x|x|
+            matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|...|l6|l7|x|x|x|x|x|x|x|x|
+            matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_4, matrixArray_stage_5); // |a8|a9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+            result_0 = _mm512_add_ps(result_0, result_1);
+
+            STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask)
+        }
+    }
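+    // NOTE: the per-row fallback below packs one n=10 row into a ymm with a
+    // 5-element epi32 masked load, multiplies against x held in a ymm, and reduces
+    // the eight partial sums with a 128-bit fold plus two shuffle/add steps.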
+
+    if (tag_m_12x != m) {
+        __m256i matrixArray256;
+        __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1);
+        __m256 result256;
+        __m128 result128, tmp128;
+        unsigned char load256_mask_value = (((unsigned char)0xff) >> 3);
+        __mmask8 load256_mask = *((__mmask8*) &load256_mask_value);
+        for (BLASLONG i = tag_m_12x; i < m; i++) {
+            result256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi32(load256_mask, &a[(i)*10]);
+            result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256);
+            result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1));
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 15 rows parallel processing BF16 GEMV kernel for n=11 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_15x11_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_15x11_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_15x11_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_15x = m - (m%15);
+
+    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);
+    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+    __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7|
+    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0|
+
+    if (tag_m_15x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+        __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10;
+        __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
+        __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3;
+
+        __m512i M512_EPI16_2, M512_EPI16_4, M512_EPI16_6, M512_EPI32_5;
+        M512_EPI16_2 = _mm512_set1_epi16(2);
+        M512_EPI16_4 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2);
+        M512_EPI16_6 = _mm512_add_epi16(M512_EPI16_4, M512_EPI16_2);
+        M512_EPI32_5 = _mm512_set1_epi32(5);
+
+        unsigned int BASE_MASK_10_value = ((unsigned int)0x000003ff);
+        __mmask32 BASE_MASK_10 = *((__mmask32*) &BASE_MASK_10_value);
+        unsigned int BASE_MASK_20_value = ((unsigned int)0x000ffc00);
+        __mmask32 BASE_MASK_20 = *((__mmask32*) &BASE_MASK_20_value);
+        unsigned int BASE_MASK_30_value = ((unsigned int)0x3ff00000);
+        __mmask32 BASE_MASK_30 = *((__mmask32*) &BASE_MASK_30_value);
+
+        idx_stage1_base_0 = _mm512_set_epi16( 0, 0, 49, 48, 38, 37, 27, 26, 16, 15, 5, 4, 47, 46, 36, 35,
+                                             25, 24, 14, 13, 3, 2, 45, 44, 34, 33,
23, 22, 12, 11, 1, 0); + idx_stage1_base_1 = _mm512_add_epi16(idx_stage1_base_0, M512_EPI16_6); + + idx_stage1_base_2 = _mm512_mask_add_epi16(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_2 = _mm512_mask_sub_epi16(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_3 = _mm512_add_epi16(idx_stage1_base_2, M512_EPI16_6); + + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_sub_epi16(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI16_4); + idx_stage1_base_5 = _mm512_add_epi16(idx_stage1_base_4, M512_EPI16_6); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|0 |x10|0 | ... 
|x10|0 | + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 9); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*11 + 32]); // Load 2 rows with n=11 plus 1 element + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0|a1|...|e0|e1|a2|a3|...|e2|e3|a4 |a5|...|e4 |e5| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6|a7|...|e6|e7|a8|a9|...|e8|e9|a10|x |...|e10|x | + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2|f3|...|j2|j3|f0|f1|...|j0|j1|f4 |f5|...|j4 |j5| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8|f9|...|j8|j9|f6|f7|...|j6|j7|f10|x |...|j10|x | + matrixArray_stage_4 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4|k5|...|o4|o5|k2|k3|...|o2|o3|k0 |k1|...|o0 |o1| + matrixArray_stage_5 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|x|...|o10|x|k8|k9|...|o8|o9|k6 |k7|...|o6 |o7| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|j0|j1|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6|a7|...|j6|j7|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2|a3|...|j2|j3|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4|a5|...|j4|j5|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8|a9|...|j8|j9|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|x|...|j10|x|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = 
_mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 5); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*11]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=12 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x12_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x12_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_15x12_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i idx_stage1_base_0, idx_stage1_base_1, 
idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI32_1, M512_EPI32_2, M512_EPI32_3, M512_EPI32_5; + M512_EPI32_1 = _mm512_set1_epi32(1); + M512_EPI32_2 = _mm512_add_epi32(M512_EPI32_1, M512_EPI32_1); + M512_EPI32_3 = _mm512_add_epi32(M512_EPI32_2, M512_EPI32_1); + M512_EPI32_5 = _mm512_add_epi32(M512_EPI32_3, M512_EPI32_2); + + unsigned short BASE_MASK_10_value = ((unsigned short)0x001f); + __mmask16 BASE_MASK_10 = *((__mmask16*) &BASE_MASK_10_value); + unsigned short BASE_MASK_20_value = ((unsigned short)0x03e0); + __mmask16 BASE_MASK_20 = *((__mmask16*) &BASE_MASK_20_value); + unsigned short BASE_MASK_30_value = ((unsigned short)0xfc00); + __mmask16 BASE_MASK_30 = *((__mmask16*) &BASE_MASK_30_value); + + idx_stage1_base_0 = _mm512_set_epi32( 0, 26, 20, 14, 8, 2, 25, 19, 13, 7, 1, 24, 18, 12, 6, 0); + idx_stage1_base_1 = _mm512_add_epi32(idx_stage1_base_0, M512_EPI32_3); + + idx_stage1_base_2 = _mm512_mask_add_epi32(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_2 = _mm512_mask_sub_epi32(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_3 = _mm512_add_epi32(idx_stage1_base_2, M512_EPI32_3); + + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_sub_epi32(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI32_2); + idx_stage1_base_5 = _mm512_add_epi32(idx_stage1_base_4, M512_EPI32_3); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|x11|x10|x11| ... 
|x10|x11| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*12 + 32]); // Load 2 rows with n=12 plus 4 elements + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*12 + 32]); // Load 2 rows with n=12 plus 4 elements + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*12 + 32]); // Load 2 rows with n=12 plus 4 elements + + // Stage 1: interleave per 32 bits (one bf16 pair per lane) + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0 |a1 |...|e0 |e1 |a2|a3|...|e2|e3|a4 |a5 |...|e4 |e5 | + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6 |a7 |...|e6 |e7 |a8|a9|...|e8|e9|a10|a11|...|e10|e11| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2 |f3 |...|j2 |j3 |f0|f1|...|j0|j1|f4 |f5 |...|j4 |j5 | + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8 |f9 |...|j8 |j9 |f6|f7|...|j6|j7|f10|f11|...|j10|j11| + matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4 |k5 |...|o4 |o5 |k2|k3|...|o2|o3|k0 |k1 |...|o0 |o1 | + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|k11|...|o10|o11|k8|k9|...|o8|o9|k6 |k7 |...|o6 |o7 | + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0 |a1 |...|j0 |j1 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6 |a7 |...|j6 |j7 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2 |a3 |...|j2 |j3 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4 |a5 |...|j4 |j5 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8 |a9 |...|j8 |j9 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|a11|...|j10|j11|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // 
|a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*12]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + + +// 16 rows parallel processing BF16 GEMV kernel for n=13 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x13_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x13_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x13_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 3); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|0|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, 
matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + 
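+ // (the macro expands to _mm512_dpbf16_ps steps: each instruction multiplies 32 bf16 pairs and adds the pairwise products into 16 fp32 lanes, so lane k accumulates a[2k]*x[2k] + a[2k+1]*x[2k+1] for its row slice)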
BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*13]); // Load 1 rows with n=13 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=14 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x14_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x14_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x14_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif 
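+ // (ZERO_BETA/ONE_BETA/ONE_ALPHA compile this one template body into the _alpha_beta, _alpha_one, _alpha and plain variants; ALPHAVECTOR and BETAVECTOR are consumed by the STORE*_COMPLETE_RESULT macros when writing back to y)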
+#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + __m512i shift_idx = _mm512_set_epi32(0, 13, 12, 11, 10, 9, 8, 7, 0, 6, 5, 4, 3, 2, 1, 0); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x32_2(matrixArray, a, 14, idx_m, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_8x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_4x32_2(matrixArray, a, 14, tag_m_16x, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_4x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 14, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, 
y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*14]); // Load 1 rows with n=14 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=15 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x15_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x15_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x15_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), 
matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + 
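+ // result128 now holds the four fp32 dot products for rows a..d; STORE4_COMPLETE_RESULT scales by alpha (and adds beta*y in the beta variants) before the 4-element store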
STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*15]); // Load 1 rows with n=15 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=16 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x16_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x16_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x16_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_8 = _mm512_loadu_si512(&a[(idx_m )*16]); // Load 2 rows with n=16 + matrixArray_9 = _mm512_loadu_si512(&a[(idx_m+2 )*16]); // Load 2 rows with n=16 + matrixArray_10 = _mm512_loadu_si512(&a[(idx_m+4 )*16]); // Load 2 rows with n=16 + matrixArray_11 = _mm512_loadu_si512(&a[(idx_m+6 )*16]); // Load 2 rows with n=16 + matrixArray_12 = _mm512_loadu_si512(&a[(idx_m+8 )*16]); // Load 2 rows with n=16 + matrixArray_13 = _mm512_loadu_si512(&a[(idx_m+10)*16]); // Load 2 rows with n=16 + matrixArray_14 = _mm512_loadu_si512(&a[(idx_m+12)*16]); // Load 2 rows with n=16 + matrixArray_15 = _mm512_loadu_si512(&a[(idx_m+14)*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + 
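+ // (the 2-step interleave reorders the 16 rows so that each 32-bit lane carries one row's consecutive bf16 pair, lining up with the pair-interleaved copies of x prepared by BF16_INTERLEAVE_1x32 above)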
BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_4 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_5 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + matrixArray_6 = _mm512_loadu_si512(&a[(tag_m_16x+4 )*16]); // Load 2 rows with n=16 + matrixArray_7 = _mm512_loadu_si512(&a[(tag_m_16x+6 )*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, \ + matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + + matrixArray256_0 = _mm512_castsi512_si256(matrixArray_0); + matrixArray256_1 = _mm512_extracti32x8_epi32(matrixArray_0, 0x1); + matrixArray256_2 = _mm512_castsi512_si256(matrixArray_1); + matrixArray256_3 = _mm512_extracti32x8_epi32(matrixArray_1, 0x1); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = 
_mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n>16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16p_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16p_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16p_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> (32-n)); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + __m512i x512 = _mm512_maskz_loadu_epi16(load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|... + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512 accum512_0, accum512_1, accum512_2, accum512_3; + __m256 accum256; + __m128 accum128; + + if (tag_m_8x > 0) { + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = x512; + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..h[0:31] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_3 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_2 = _mm512_add_ps(accum512_2, accum512_3); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_2), _mm512_extractf32x8_ps(accum512_2, 1)); + STORE8_COMPLETE_RESULT(accum256, y+idx_m) + } + + if (m - tag_m_8x > 3) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 4 rows from matrix + BF16_MATRIX_MASKZ_LOAD_4x32(matrixArray, a, lda, tag_m_8x, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..d[0:31] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 
1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + STORE4_COMPLETE_RESULT(accum128, y+tag_m_8x) + tag_m_8x += 4; + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 rows with n=16 + accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) matrixArray_0, (__m512bh) x512); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_1x128_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_1x128_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_1x128_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + BLASLONG tag_n_128x = n & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m512 accum512_bridge[8]; + __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + if (tag_m_8x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + for (int j = idx_m; j < idx_m + 8; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, 
j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + // Temporarily save the result into a ZMM + accum512_bridge[j-idx_m] = accum512_t_0; + } + + FP32_INTERLEAVE_8x16_ARRAY(accum512_bridge) + FP32_ACCUM2_8x16_ARRAY(accum512_bridge) + accum512_bridge[1] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_0, accum512_bridge[4]); + accum512_bridge[2] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_1, accum512_bridge[4]); + accum512_bridge[1] = _mm512_add_ps(accum512_bridge[1], accum512_bridge[2]); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_bridge[1]), _mm512_extractf32x8_ps(accum512_bridge[1], 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG j = tag_m_8x; j < m; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + 
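+ // the masked loads zero-fill the lanes at and beyond column n, so the tail dot product below adds exact zeros for the out-of-range columns (accum512_t_2 simply reuses one of the four accumulators)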
BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_t_0), _mm512_extractf32x8_ps(accum512_t_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[j] = alpha * accum128[0] + beta * y[j]; +#else + y[j] = alpha * accum128[0] + y[j]; +#endif +#else +#ifndef ONE_ALPHA + y[j] = accum128[0] * alpha; +#else + y[j] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n=32 && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x32_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x32_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + if (tag_m_8x > 0) { + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 8 rows from matrix + BF16_MATRIX_LOAD_8x32(matrixArray, a, lda, idx_m, idx_n) + + // Load x + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + if (tag_n_32x != n) { // Go with masked 512 + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, tag_n_32x, tail_mask) + + // Load x 
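+ // (masked down to the n&31 remaining columns; zeroed lanes contribute nothing to the dot product)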
+ BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + // 2-step interleave for FP32 register array + FP32_INTERLEAVE_8x16(accum512) + + // Accumulate the 2 batches of registers into 2 registers (0 and 4) + FP32_ACCUM2_8x16(accum512) + + accum512_1 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_4); + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_4); + accum512_1 = _mm512_add_ps(accum512_1, accum512_2); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_1), _mm512_extractf32x8_ps(accum512_1, 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 32 elements from matrix + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, i, idx_n) + + // Load 32 elements from x + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + if (tag_n_32x != n) { + // Load tail elements from matrix + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, i, tag_n_32x, tail_mask) + + // Load tail elements from x + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n<=16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16m_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16m_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16m_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + __m256i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m256i xArray256; + + // Keep aligned with the other kernels and macro definitions; the high 256 bits are never used +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif + + __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ + accum256_8, accum256_9, accum256_10, accum256_11, accum256_12, accum256_13, 
accum256_14, accum256_15; + + __m256i M256_EPI32_4 = _mm256_set1_epi32(4); + __m256i idx_base_0 = _mm256_set_epi32(11, 10, 9, 8, 3, 2, 1, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_4); + + unsigned short load_mask_value = (((unsigned short)0xffff) >> (16-n)); + __mmask16 load_mask = *((__mmask16*) &load_mask_value); + + if (n == 16) { + BF16_VECTOR_LOAD_1x16(xArray256, x, 0) + } else { + BF16_VECTOR_MASKZ_LOAD_1x16(xArray256, x, 0, load_mask) + } + + if (n == 16) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_LOAD_8x16(matrixArray, a, lda, idx_m, 0) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 register array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batches of registers into 2 registers (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = _mm256_setzero_ps(); + matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 row with n=16 + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); + y[i] += accum128[0] * alpha; + } + } + } else { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray, a, lda, idx_m, 0, load_mask) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 register array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batches of registers into 2 registers (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = _mm256_setzero_ps(); + matrixArray_0 = _mm256_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 row with n<16 (masked) + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef 
+#ifndef ONE_BETA
+                y[i] = alpha * accum128[0] + beta * y[i];
+#else
+                y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+                y[i] = accum128[0] * alpha;
+#else
+                y[i] = accum128[0];
+#endif
+#endif
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c
new file mode 100644
index 000000000..3de586cb8
--- /dev/null
+++ b/kernel/x86_64/srot.c
@@ -0,0 +1,207 @@
+#include "common.h"
+
+#if defined(SKYLAKEX)
+#include "srot_microk_skylakex-2.c"
+#elif defined(HASWELL)
+#include "srot_microk_haswell-2.c"
+#endif
+
+#ifndef HAVE_SROT_KERNEL
+#include "../simd/intrin.h"
+
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128)
+    const int vstep = v_nlanes_f32;
+    const int unrollx4 = n & (-vstep * 4);
+    const int unrollx = n & -vstep;
+
+    v_f32 __c = v_setall_f32(c);
+    v_f32 __s = v_setall_f32(s);
+    v_f32 vx0, vx1, vx2, vx3;
+    v_f32 vy0, vy1, vy2, vy3;
+    v_f32 vt0, vt1, vt2, vt3;
+
+    for (; i < unrollx4; i += vstep * 4) {
+        vx0 = v_loadu_f32(x + i);
+        vx1 = v_loadu_f32(x + i + vstep);
+        vx2 = v_loadu_f32(x + i + vstep * 2);
+        vx3 = v_loadu_f32(x + i + vstep * 3);
+        vy0 = v_loadu_f32(y + i);
+        vy1 = v_loadu_f32(y + i + vstep);
+        vy2 = v_loadu_f32(y + i + vstep * 2);
+        vy3 = v_loadu_f32(y + i + vstep * 3);
+
+        vt0 = v_mul_f32(__s, vy0);
+        vt1 = v_mul_f32(__s, vy1);
+        vt2 = v_mul_f32(__s, vy2);
+        vt3 = v_mul_f32(__s, vy3);
+
+        vt0 = v_muladd_f32(__c, vx0, vt0);
+        vt1 = v_muladd_f32(__c, vx1, vt1);
+        vt2 = v_muladd_f32(__c, vx2, vt2);
+        vt3 = v_muladd_f32(__c, vx3, vt3);
+
+        v_storeu_f32(x + i, vt0);
+        v_storeu_f32(x + i + vstep, vt1);
+        v_storeu_f32(x + i + vstep * 2, vt2);
+        v_storeu_f32(x + i + vstep * 3, vt3);
+
+        vt0 = v_mul_f32(__s, vx0);
+        vt1 = v_mul_f32(__s, vx1);
+        vt2 = v_mul_f32(__s, vx2);
+        vt3 = v_mul_f32(__s, vx3);
+
+        vt0 = v_mulsub_f32(__c, vy0, vt0);
+        vt1 = v_mulsub_f32(__c, vy1, vt1);
+        vt2 = v_mulsub_f32(__c, vy2, vt2);
+        vt3 = v_mulsub_f32(__c, vy3, vt3);
+
+        v_storeu_f32(y + i, vt0);
+        v_storeu_f32(y + i + vstep, vt1);
+        v_storeu_f32(y + i + vstep * 2, vt2);
+        v_storeu_f32(y + i + vstep * 3, vt3);
+    }
+
+    for (; i < unrollx; i += vstep) {
+        vx0 = v_loadu_f32(x + i);
+        vy0 = v_loadu_f32(y + i);
+
+        vt0 = v_mul_f32(__s, vy0);
+        vt0 = v_muladd_f32(__c, vx0, vt0);
+        v_storeu_f32(x + i, vt0);
+
+        vt0 = v_mul_f32(__s, vx0);
+        vt0 = v_mulsub_f32(__c, vy0, vt0);
+        v_storeu_f32(y + i, vt0);
+    }
+#else
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+
+    BLASLONG n1 = n & (~7);
+    while (i < n1) {
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+#endif
+
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0, iy = 0;
+
+    FLOAT temp;
+
+    if (n <= 0)
+        return;
+    if ((inc_x == 1) && (inc_y == 1)) {
+        srot_kernel(n, x, y, c, s);
+    }
+    else {
+        while (i < n) {
+            temp = c * x[ix] + s * y[iy];
+            y[iy] = c * y[iy] - s * x[ix];
+            x[ix] = temp;
+
+            ix += inc_x;
+            iy += inc_y;
+            i++;
+        }
+    }
+    return;
+}
+
+
+#if defined(SMP)
+static int rot_thread_function(blas_arg_t *args)
+{
+
+    rot_compute(args->m,
+                args->a, args->lda,
+                args->b, args->ldb,
+                ((float *)args->alpha)[0],
+                ((float *)args->alpha)[1]);
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT alpha[2]={c, s};
+    FLOAT dummy_c;
+#endif
+
+#if defined(SMP)
+    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
+        nthreads = 1;
+    }
+    else {
+        nthreads = num_cpu_avail(1);
+    }
+
+    if (nthreads == 1) {
+        rot_compute(n, x, inc_x, y, inc_y, c, s);
+    }
+    else {
+#if defined(DOUBLE)
+        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+    }
+#else
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+#endif
+    return 0;
+}
diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c
new file mode 100644
index 000000000..8e245cc8f
--- /dev/null
+++ b/kernel/x86_64/srot_microk_haswell-2.c
@@ -0,0 +1,87 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+    BLASLONG tail_index_8 = n&(~7);
+    BLASLONG tail_index_32 = n&(~31);
+
+    __m256 c_256, s_256;
+    if (n >= 8) {
+        c_256 = _mm256_set1_ps(c);
+        s_256 = _mm256_set1_ps(s);
+    }
+
+    __m256 x0, x1, x2, x3;
+    __m256 y0, y1, y2, y3;
+    __m256 t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm256_loadu_ps(&x[i + 0]);
+        x1 = _mm256_loadu_ps(&x[i + 8]);
+        x2 = _mm256_loadu_ps(&x[i +16]);
+        x3 = _mm256_loadu_ps(&x[i +24]);
+        y0 = _mm256_loadu_ps(&y[i + 0]);
+        y1 = _mm256_loadu_ps(&y[i + 8]);
+        y2 = _mm256_loadu_ps(&y[i +16]);
+        y3 = _mm256_loadu_ps(&y[i +24]);
+
+        t0 = _mm256_mul_ps(s_256, y0);
+        t1 = _mm256_mul_ps(s_256, y1);
+        t2 = _mm256_mul_ps(s_256, y2);
+        t3 = _mm256_mul_ps(s_256, y3);
+
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
+        t1 = _mm256_fmadd_ps(c_256, x1, t1);
+        t2 = _mm256_fmadd_ps(c_256, x2, t2);
+        t3 = _mm256_fmadd_ps(c_256, x3, t3);
+
+        _mm256_storeu_ps(&x[i + 0], t0);
+        _mm256_storeu_ps(&x[i + 8], t1);
+        _mm256_storeu_ps(&x[i +16], t2);
+        _mm256_storeu_ps(&x[i +24], t3);
+
+        t0 = _mm256_mul_ps(s_256, x0);
+        t1 = _mm256_mul_ps(s_256, x1);
+        t2 = _mm256_mul_ps(s_256, x2);
+        t3 = _mm256_mul_ps(s_256, x3);
+
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        t1 = _mm256_fmsub_ps(c_256, y1, t1);
+        t2 = _mm256_fmsub_ps(c_256, y2, t2);
+        t3 = _mm256_fmsub_ps(c_256, y3, t3);
+
+        _mm256_storeu_ps(&y[i + 0], t0);
+        _mm256_storeu_ps(&y[i + 8], t1);
+        _mm256_storeu_ps(&y[i +16], t2);
+        _mm256_storeu_ps(&y[i +24], t3);
+
+    }
+
+    for (i = tail_index_32; i < tail_index_8; i += 8) {
+        x0 = _mm256_loadu_ps(&x[i]);
+        y0 = _mm256_loadu_ps(&y[i]);
+
+        t0 = _mm256_mul_ps(s_256, y0);
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
+        _mm256_storeu_ps(&x[i], t0);
+
+        t0 = _mm256_mul_ps(s_256, x0);
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        _mm256_storeu_ps(&y[i], t0);
+    }
+
+    for (i = tail_index_8; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c
new file mode 100644
index 000000000..a21d1cf64
--- /dev/null
+++ b/kernel/x86_64/srot_microk_skylakex-2.c
@@ -0,0 +1,91 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    __m512 c_512, s_512;
+    c_512 = _mm512_set1_ps(c);
+    s_512 = _mm512_set1_ps(s);
+
+    BLASLONG tail_index_16 = n&(~15);
+    BLASLONG tail_index_64 = n&(~63);
+
+
+    __m512 x0, x1, x2, x3;
+    __m512 y0, y1, y2, y3;
+    __m512 t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_64; i += 64) {
+        x0 = _mm512_loadu_ps(&x[i + 0]);
+        x1 = _mm512_loadu_ps(&x[i +16]);
+        x2 = _mm512_loadu_ps(&x[i +32]);
+        x3 = _mm512_loadu_ps(&x[i +48]);
+        y0 = _mm512_loadu_ps(&y[i + 0]);
+        y1 = _mm512_loadu_ps(&y[i +16]);
+        y2 = _mm512_loadu_ps(&y[i +32]);
+        y3 = _mm512_loadu_ps(&y[i +48]);
+
+        t0 = _mm512_mul_ps(s_512, y0);
+        t1 = _mm512_mul_ps(s_512, y1);
+        t2 = _mm512_mul_ps(s_512, y2);
+        t3 = _mm512_mul_ps(s_512, y3);
+
+        t0 = _mm512_fmadd_ps(c_512, x0, t0);
+        t1 = _mm512_fmadd_ps(c_512, x1, t1);
+        t2 = _mm512_fmadd_ps(c_512, x2, t2);
+        t3 = _mm512_fmadd_ps(c_512, x3, t3);
+
+        _mm512_storeu_ps(&x[i + 0], t0);
+        _mm512_storeu_ps(&x[i +16], t1);
+        _mm512_storeu_ps(&x[i +32], t2);
+        _mm512_storeu_ps(&x[i +48], t3);
+
+        t0 = _mm512_mul_ps(s_512, x0);
+        t1 = _mm512_mul_ps(s_512, x1);
+        t2 = _mm512_mul_ps(s_512, x2);
+        t3 = _mm512_mul_ps(s_512, x3);
+
+        t0 = _mm512_fmsub_ps(c_512, y0, t0);
+        t1 = _mm512_fmsub_ps(c_512, y1, t1);
+        t2 = _mm512_fmsub_ps(c_512, y2, t2);
+        t3 = _mm512_fmsub_ps(c_512, y3, t3);
+
+        _mm512_storeu_ps(&y[i + 0], t0);
+        _mm512_storeu_ps(&y[i +16], t1);
+        _mm512_storeu_ps(&y[i +32], t2);
+        _mm512_storeu_ps(&y[i +48], t3);
+    }
+
+    for (i = tail_index_64; i < tail_index_16; i += 16) {
+        x0 = _mm512_loadu_ps(&x[i]);
+        y0 = _mm512_loadu_ps(&y[i]);
+
+        t0 = _mm512_mul_ps(s_512, y0);
+        t0 = _mm512_fmadd_ps(c_512, x0, t0);
+        _mm512_storeu_ps(&x[i], t0);
+
+        t0 = _mm512_mul_ps(s_512, x0);
+        t0 = _mm512_fmsub_ps(c_512, y0, t0);
+        _mm512_storeu_ps(&y[i], t0);
+    }
+
+
+    if ((n & 15) > 0) {
+        uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15)));
+        __m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]);
+        __m512 tail_y = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]);
+        __m512 temp = _mm512_mul_ps(s_512, tail_y);
+        temp = _mm512_fmadd_ps(c_512, tail_x, temp);
+        _mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp);
+        temp = _mm512_mul_ps(s_512, tail_x);
+        temp = _mm512_fmsub_ps(c_512, tail_y, temp);
+        _mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp);
+    }
+}
+#endif
diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c
new file mode 100644
index 000000000..6e758e2e3
--- /dev/null
+++ b/kernel/x86_64/zasum.c
@@ -0,0 +1,144 @@
+#include "common.h"
+
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
+
+#if defined(SKYLAKEX)
+#include "zasum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_ZASUM_KERNEL
+static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
+{
+
+    BLASLONG i=0;
+    BLASLONG n_8 = n & -8;
+    FLOAT *x1 = x;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+
+    while (i < n_8) {    // i counts complex elements; each pass handles 4 of them (8 floats)
+        temp0 = ABS_K(x1[0]);
+        temp1 = ABS_K(x1[1]);
+        temp2 = ABS_K(x1[2]);
+        temp3 = ABS_K(x1[3]);
+        temp4 = ABS_K(x1[4]);
+        temp5 = ABS_K(x1[5]);
+        temp6 = ABS_K(x1[6]);
+        temp7 = ABS_K(x1[7]);
+
+        sum0 += temp0;
+        sum1 += temp1;
+        sum2 += temp2;
+        sum3 += temp3;
+
+        sum0 += temp4;
+        sum1 += temp5;
+        sum2 += temp6;
+        sum3 += temp7;
+
+        x1+=8;
+        i+=4;
+    }
+
+    while (i < n) {
+        sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
+        x1 += 2;
+        i++;
+    }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ip = 0;
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    if (inc_x == 1) {
+        sumf = zasum_kernel(n, x);
+    }
+    else {
+        inc_x2 = 2 * inc_x;
+
+        while (i < n) {
+            sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
+            ip += inc_x2;
+            i++;
+        }
+    }
+
+    return(sumf);
+}
+
+#if defined(SMP)
+static int asum_thread_function(BLASLONG n,
+                                BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+                                FLOAT *x, BLASLONG inc_x,
+                                FLOAT * dummy3, BLASLONG dummy4,
+                                FLOAT * result, BLASLONG dummy5)
+{
+    *(FLOAT *) result = asum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode,
+                                                BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+                                                void *a, BLASLONG lda,
+                                                void *b, BLASLONG ldb,
+                                                void *c, BLASLONG ldc,
+                                                int (*function)(),
+                                                int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+
+    if (nthreads == 1) {
+        sumf = asum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
+                                             NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
+        }
+    }
+#else
+    sumf = asum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
new file mode 100644
index 000000000..b44c53801
--- /dev/null
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -0,0 +1,340 @@
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_ZASUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf=0.0;
+    BLASLONG n2 = n + n;
+
+
+    if (n2 < 32) {
+        __m128d accum_10, accum_11, accum_12, accum_13;
+        __m128d abs_mask1;
+
+        accum_10 = _mm_setzero_pd();
+        accum_11 = _mm_setzero_pd();
+        accum_12 = _mm_setzero_pd();
+        accum_13 = _mm_setzero_pd();
+
+        // build the |x| mask 0x7fffffffffffffff: cmpeq(x,x) gives all-ones for any contents, the shift clears the sign bit
+        abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
+        abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
+
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+        if (n2 >= 16){
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+            _mm_prefetch(&x1[8], _MM_HINT_T0);
+            __m128d x04 = _mm_loadu_pd(&x1[ 8]);
+            __m128d x05 = _mm_loadu_pd(&x1[10]);
+            __m128d x06 = _mm_loadu_pd(&x1[12]);
+            __m128d x07 = _mm_loadu_pd(&x1[14]);
+
+            x00 = _mm_and_pd(x00, abs_mask1);
+            x01 = _mm_and_pd(x01, abs_mask1);
+            x02 = _mm_and_pd(x02, abs_mask1);
+            x03 = _mm_and_pd(x03, abs_mask1);
+
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+
+            x04 = _mm_and_pd(x04, abs_mask1);
+            x05 = _mm_and_pd(x05, abs_mask1);
+            x06 = _mm_and_pd(x06, abs_mask1);
+            x07 = _mm_and_pd(x07, abs_mask1);
+
+            accum_10 = _mm_add_pd(accum_10, x04);
+            accum_11 = _mm_add_pd(accum_11, x05);
+            accum_12 = _mm_add_pd(accum_12, x06);
+            accum_13 = _mm_add_pd(accum_13, x07);
+
+            x1 += 16;
+            n2 -= 16;
+        }
+
+        if (n2 >= 8) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+            x00 = _mm_and_pd(x00, abs_mask1);
+            x01 = _mm_and_pd(x01, abs_mask1);
+            x02 = _mm_and_pd(x02, abs_mask1);
+            x03 = _mm_and_pd(x03, abs_mask1);
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+
+            n2 -= 8;
+            x1 += 8;
+        }
+
+        if (n2 >= 4) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            x00 = _mm_and_pd(x00, abs_mask1);
+            x01 = _mm_and_pd(x01, abs_mask1);
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+
+            n2 -= 4;
+            x1 += 4;
+        }
+
+        if (n2) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            x00 = _mm_and_pd(x00, abs_mask1);
+            accum_10 = _mm_add_pd(accum_10, x00);
+        }
+
+        accum_10 = _mm_add_pd(accum_10, accum_11);
+        accum_12 = _mm_add_pd(accum_12, accum_13);
+        accum_10 = _mm_add_pd(accum_10, accum_12);
+
+        accum_10 = _mm_hadd_pd(accum_10, accum_10);
+
+        sumf = accum_10[0];
+    }
+    else {
+        __m512d accum_0, accum_1, accum_2, accum_3;
+        __m512d x00, x01, x02, x03, x04, x05, x06, x07;
+        __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff);
+
+        accum_0 = _mm512_setzero_pd();
+        accum_1 = _mm512_setzero_pd();
+        accum_2 = _mm512_setzero_pd();
+        accum_3 = _mm512_setzero_pd();
+
+        // aligning the pointer has extra masked-load overhead that only pays off for large enough inputs
+        if (n2 < 128) {
+            if (n2 >= 64) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                x04 = _mm512_loadu_pd(&x1[32]);
+                x05 = _mm512_loadu_pd(&x1[40]);
+                x06 = _mm512_loadu_pd(&x1[48]);
+                x07 = _mm512_loadu_pd(&x1[56]);
+
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                x04 = _mm512_and_pd(x04, abs_mask);
+                x05 = _mm512_and_pd(x05, abs_mask);
+                x06 = _mm512_and_pd(x06, abs_mask);
+                x07 = _mm512_and_pd(x07, abs_mask);
+
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf = _mm512_reduce_add_pd(accum_0);
+        }
+        // n2 >= 128: align x1 to a 64-byte boundary first
+        else {
+
+            int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;
+
+            if (0 != align_header) {
+                unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header));
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= align_header;
+                x1 += align_header;
+            }
+
+            x00 = _mm512_load_pd(&x1[ 0]);
+            x01 = _mm512_load_pd(&x1[ 8]);
+            x02 = _mm512_load_pd(&x1[16]);
+            x03 = _mm512_load_pd(&x1[24]);
+            x04 = _mm512_load_pd(&x1[32]);
+            x05 = _mm512_load_pd(&x1[40]);
+            x06 = _mm512_load_pd(&x1[48]);
+            x07 = _mm512_load_pd(&x1[56]);
+
+            n2 -= 64;
+            x1 += 64;
+
+            while (n2 >= 64) {
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                x00 = _mm512_load_pd(&x1[ 0]);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                x02 = _mm512_load_pd(&x1[16]);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+                x03 = _mm512_load_pd(&x1[24]);
+
+                x04 = _mm512_and_pd(x04, abs_mask);
+                x05 = _mm512_and_pd(x05, abs_mask);
+                x06 = _mm512_and_pd(x06, abs_mask);
+                x07 = _mm512_and_pd(x07, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                x04 = _mm512_load_pd(&x1[32]);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                x05 = _mm512_load_pd(&x1[40]);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                x06 = _mm512_load_pd(&x1[48]);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+                x07 = _mm512_load_pd(&x1[56]);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+            x00 = _mm512_and_pd(x00, abs_mask);
+            x01 = _mm512_and_pd(x01, abs_mask);
+            x02 = _mm512_and_pd(x02, abs_mask);
+            x03 = _mm512_and_pd(x03, abs_mask);
+
+            accum_0 = _mm512_add_pd(accum_0, x00);
+            accum_1 = _mm512_add_pd(accum_1, x01);
+            accum_2 = _mm512_add_pd(accum_2, x02);
+            accum_3 = _mm512_add_pd(accum_3, x03);
+
+            x04 = _mm512_and_pd(x04, abs_mask);
+            x05 = _mm512_and_pd(x05, abs_mask);
+            x06 = _mm512_and_pd(x06, abs_mask);
+            x07 = _mm512_and_pd(x07, abs_mask);
+
+            accum_0 = _mm512_add_pd(accum_0, x04);
+            accum_1 = _mm512_add_pd(accum_1, x05);
+            accum_2 = _mm512_add_pd(accum_2, x06);
+            accum_3 = _mm512_add_pd(accum_3, x07);
+
+            if (n2 >= 32) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                x02 = _mm512_load_pd(&x1[16]);
+                x03 = _mm512_load_pd(&x1[24]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                x02 = _mm512_and_pd(x02, abs_mask);
+                x03 = _mm512_and_pd(x03, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                x01 = _mm512_and_pd(x01, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+                x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                x00 = _mm512_and_pd(x00, abs_mask);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf = _mm512_reduce_add_pd(accum_0);
+        }
+    }
+
+    return sumf;
+}
+#endif
diff --git a/lapack/laswp/riscv64/Makefile b/lapack/laswp/riscv64/Makefile
new file mode 100644
index 000000000..75411deb5
--- /dev/null
+++ b/lapack/laswp/riscv64/Makefile
@@ -0,0 +1,14 @@
+TOPDIR = ../../..
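+# No RISC-V specific LASWP kernels yet: the ifndef defaults below fall back to the generic C implementations.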
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP = ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP = ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index f3ddde6a1..6c5e0f107 100644
--- a/param.h
+++ b/param.h
@@ -644,9 +644,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
-
+/*
 #define SGEMM_DEFAULT_UNROLL_MN 32
 #define DGEMM_DEFAULT_UNROLL_MN 32
+*/
 #endif
 
 #ifdef ARCH_X86
@@ -1454,22 +1455,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_P 768
 #define SGEMM_DEFAULT_R sgemm_r
-//#define SGEMM_DEFAULT_R 1024
+/*#define SGEMM_DEFAULT_R 1024*/
 
 #define DGEMM_DEFAULT_P 512
 #define DGEMM_DEFAULT_R dgemm_r
-//#define DGEMM_DEFAULT_R 1024
+/*#define DGEMM_DEFAULT_R 1024*/
 
 #define QGEMM_DEFAULT_P 504
 #define QGEMM_DEFAULT_R qgemm_r
 
 #define CGEMM_DEFAULT_P 768
 #define CGEMM_DEFAULT_R cgemm_r
-//#define CGEMM_DEFAULT_R 1024
+/*#define CGEMM_DEFAULT_R 1024*/
 
 #define ZGEMM_DEFAULT_P 512
 #define ZGEMM_DEFAULT_R zgemm_r
-//#define ZGEMM_DEFAULT_R 1024
+/*#define ZGEMM_DEFAULT_R 1024*/
 
 #define XGEMM_DEFAULT_P 252
 #define XGEMM_DEFAULT_R xgemm_r
@@ -1552,9 +1553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
-
+/*
 #define SGEMM_DEFAULT_UNROLL_MN 32
 #define DGEMM_DEFAULT_UNROLL_MN 32
+*/
 #endif
 
 #ifdef ARCH_X86
@@ -2388,7 +2390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(POWER9) || defined(POWER10)
+#if defined(POWER9)
 #define SNUMOPT 16
 #define DNUMOPT 8
 
@@ -2426,6 +2428,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #if defined(POWER10)
+#define SNUMOPT 16
+#define DNUMOPT 8
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 65536
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 832
+#define DGEMM_DEFAULT_P 320
+#define CGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_P 256
+
+#define SGEMM_DEFAULT_Q 1026
+#define DGEMM_DEFAULT_Q 960
+#define CGEMM_DEFAULT_Q 1026
+#define ZGEMM_DEFAULT_Q 1026
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 8
+
 #undef SBGEMM_DEFAULT_UNROLL_N
 #undef SBGEMM_DEFAULT_UNROLL_M
 #undef SBGEMM_DEFAULT_P
@@ -2537,7 +2572,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#ifdef LOONGSON3A
+#if defined(LOONGSON3R4)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#ifdef HAVE_MSA
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 8
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#else
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 4
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#endif
+
+#define SGEMM_DEFAULT_P 64
+#define DGEMM_DEFAULT_P 44
+#define CGEMM_DEFAULT_P 64
+#define ZGEMM_DEFAULT_P 32
+
+#define SGEMM_DEFAULT_Q 192
+#define DGEMM_DEFAULT_Q 92
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 80
+
+#define SGEMM_DEFAULT_R 640
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R 640
+#define ZGEMM_DEFAULT_R 640
+
+#define GEMM_OFFSET_A1 0x10000
+#define GEMM_OFFSET_B1 0x100000
+
+#define SYMV_P 16
+#endif
+
+#if defined(LOONGSON3R3)
 ////Copy from SICORTEX
 #define SNUMOPT 2
 #define DNUMOPT 2
@@ -2579,47 +2669,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#ifdef LOONGSON3B
-#define SNUMOPT 2
-#define DNUMOPT 2
-
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
-
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define SGEMM_DEFAULT_UNROLL_N 2
-
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_N 2
-
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
-
-#define ZGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
-
-#define SGEMM_DEFAULT_P 64
-#define DGEMM_DEFAULT_P 24
-#define CGEMM_DEFAULT_P 24
-#define ZGEMM_DEFAULT_P 20
-
-#define SGEMM_DEFAULT_Q 192
-#define DGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 128
-#define ZGEMM_DEFAULT_Q 64
-
-#define SGEMM_DEFAULT_R 512
-#define DGEMM_DEFAULT_R 512
-#define CGEMM_DEFAULT_R 512
-#define ZGEMM_DEFAULT_R 512
-
-#define GEMM_OFFSET_A1 0x10000
-#define GEMM_OFFSET_B1 0x100000
-
-#define SYMV_P 16
-#endif
-
 #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500)
 #define SNUMOPT 2
 #define DNUMOPT 2
@@ -2672,6 +2721,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
+#ifdef RISCV64_GENERIC
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
+#ifdef C910V
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 160
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
 #ifdef ARMV7
 #define SNUMOPT 2
 #define DNUMOPT 2
@@ -2752,7 +2879,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-// Common ARMv8 parameters
+/* Common ARMv8 parameters */
 #if defined(ARMV8)
 
 #define SNUMOPT 2
@@ -2955,7 +3082,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#else // Other/undetected ARMv8 cores
+#else /* Other/undetected ARMv8 cores */
 
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 4
@@ -2984,9 +3111,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#endif // Cores
+#endif /* Cores */
 
-#endif // ARMv8
+#endif /* ARMv8 */
 
 #if defined(ARMV5)
 #define SNUMOPT 2
diff --git a/test/Makefile b/test/Makefile
index eb3bc3447..5f653414a 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -23,7 +23,7 @@ endif
 
 level1: $(S1) $(D1) $(C1) $(Z1)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 ifeq ($(BUILD_SINGLE),1)
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
 endif
@@ -83,7 +83,7 @@ endif
 
 level2: $(S2) $(D2) $(C2) $(Z2)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	rm -f ?BLAT2.SUMM
 ifeq ($(BUILD_SINGLE),1)
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
@@ -160,7 +160,7 @@ endif
 
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	rm -f ?BLAT3.SUMM
ifeq ($(BUILD_BFLOAT16),1)
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM
@@ -232,7 +232,7 @@ endif
 
 level3_3m : zblat3_3m cblat3_3m
 
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	rm -f ?BLAT3_3M.SUMM
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
 	@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@@ -258,6 +258,12 @@ endif
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 
+
+ifeq ($(CORE), C910V)
+EXTRALIB =
+CEXTRALIB =
+endif
+
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index dc5175fc5..0c99e0d12 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -27,13 +27,17 @@ endif ()
 
 # known to hang with the native Windows and Android threads
 # FIXME needs checking if this works on any of the other platforms
-if (NOT USE_OPENMP)
 if (OS_CYGWIN_NT OR OS_LINUX)
+if (NOT USE_OPENMP)
 set(OpenBLAS_utest_src
     ${OpenBLAS_utest_src}
     test_fork.c
     )
 endif()
+set(OpenBLAS_utest_src
+    ${OpenBLAS_utest_src}
+    test_post_fork.c
+    )
 endif()
 
 if (NOT NO_LAPACK)
@@ -54,7 +58,7 @@ add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
 
 target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME})
 
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
 target_link_libraries(${OpenBLAS_utest_bin} m)
 endif()
 
diff --git a/utest/Makefile b/utest/Makefile
index 31d4ccf00..fad3607de 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -25,15 +25,19 @@ endif
 
 #this does not work with OpenMP nor with native Windows or Android threads
 # FIXME TBD if this works on OSX, SunOS, POWER and zarch
-ifndef USE_OPENMP
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
+ifneq ($(USE_OPENMP), 1)
 OBJS += test_fork.o
 endif
+OBJS += test_post_fork.o
 endif
 
 ifeq ($(C_COMPILER), PGI)
 OBJS = utest_main2.o
 endif
+ifeq ($(C_COMPILER), SUN)
+OBJS = utest_main2.o
+endif
 ifeq ($(OSNAME), AIX)
 OBJS = utest_main2.o
 endif
@@ -44,7 +48,7 @@ $(UTESTBIN): $(OBJS)
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
 
 run_test: $(UTESTBIN)
-ifndef CROSS
+ifneq ($(CROSS), 1)
 	./$(UTESTBIN)
 endif
 
diff --git a/utest/test_fork.c b/utest/test_fork.c
index 5c976f920..bd531e7fb 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/wait.h>
 #include "openblas_utest.h"
 
-void* xmalloc(size_t n)
+static void* xmalloc(size_t n)
 {
     void* tmp;
     tmp = malloc(n);
@@ -49,7 +49,7 @@ void* xmalloc(size_t n)
 }
 
 #ifdef BUILD_DOUBLE
-void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
+static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
 {
     char trans1 = 'T';
     char trans2 = 'N';
diff --git a/utest/test_post_fork.c b/utest/test_post_fork.c
new file mode 100644
index 000000000..9370a02ce
--- /dev/null
+++ b/utest/test_post_fork.c
@@ -0,0 +1,131 @@
+/*****************************************************************************
+Copyright (c) 2011-2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#ifdef USE_OPENMP
+#include <omp.h>
+#endif
+#include "openblas_utest.h"
+
+static void* xmalloc(size_t n)
+{
+    void* tmp;
+    tmp = malloc(n);
+    if (tmp == NULL) {
+        fprintf(stderr, "You are about to die\n");
+        exit(1);
+    } else {
+        return tmp;
+    }
+}
+
+#ifdef BUILD_DOUBLE
+static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
+{
+    char trans1 = 'T';
+    char trans2 = 'N';
+    double zerod = 0, oned = 1;
+    int i;
+    BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n);
+    for(i = 0; i < n * n; ++i) {
+        ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS);
+    }
+}
+#endif
+
+CTEST(fork, safety_after_fork_in_parent)
+{
+#ifndef BUILD_DOUBLE
+    exit(0);
+#else
+    blasint n = 100;
+    int i, nthreads_omp;
+
+    double *a, *b, *c, *d;
+    size_t n_bytes;
+
+    pid_t fork_pid;
+
+    n_bytes = sizeof(*a) * n * n;
+
+    a = xmalloc(n_bytes);
+    b = xmalloc(n_bytes);
+    c = xmalloc(n_bytes);
+    d = xmalloc(n_bytes);
+
+    // Put ones in a and b; put n in c (the expected result)
+    for(i = 0; i < n * n; ++i) {
+        a[i] = 1;
+        b[i] = 1;
+        c[i] = 1 * n;
+    }
+
+    // Test that OpenBLAS works after a fork.
+    // This situation routinely happens with Python's NumPy, where e.g.
+    // `sys.platform` calls `uname` in a forked process.
+    // So we simulate that situation here.
+
+    // There was an issue where differing numbers of OpenBLAS and OpenMP
+    // threads triggered a memory leak, so run this multiple times
+    // with different numbers of threads set.
+#ifdef USE_OPENMP
+    nthreads_omp = omp_get_max_threads();
+    // Run with half the max OMP threads, the max threads, and twice that
+    for(i = (nthreads_omp + 1) / 2; i <= nthreads_omp * 2; i *= 2) {
+        omp_set_num_threads(i);
+#endif
+
+        fork_pid = fork();
+        if (fork_pid == -1) {
+            CTEST_ERR("Failed to fork process.");
+        } else if (fork_pid == 0) {
+            // Just pretend to do something, e.g. call `uname`, then exit
+            exit(0);
+        } else {
+            // Wait for the child to finish and check the exit code.
+            int child_status = 0;
+            pid_t wait_pid = wait(&child_status);
+            ASSERT_EQUAL(wait_pid, fork_pid);
+            ASSERT_EQUAL(0, WEXITSTATUS(child_status));
+
+            // Now OpenBLAS has to work
+            check_dgemm(a, b, d, c, n);
+        }
+#ifdef USE_OPENMP
+    }
+#endif
+
+#endif
+}
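Reviewer note, not part of the patch: the srot kernels added above implement the BLAS plane rotation x[i] = c*x[i] + s*y[i], y[i] = c*y[i] - s*x[i]. A minimal standalone smoke test through the CBLAS interface, assuming a built libopenblas with cblas.h on the include path, could look like the sketch below; the file name srot_demo.c and the tolerance are illustrative only.

    /* Hypothetical usage sketch; build with: cc srot_demo.c -lopenblas -lm */
    #include <stdio.h>
    #include <math.h>
    #include <cblas.h>

    int main(void) {
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
        const float c = 0.6f, s = 0.8f;      /* c*c + s*s == 1, a valid Givens pair */
        float ex = c * x[0] + s * y[0];      /* expected x[0] from the scalar formula */
        float ey = c * y[0] - s * x[0];      /* expected y[0] from the scalar formula */

        cblas_srot(4, x, 1, y, 1, c, s);     /* unit strides take the vectorized srot_kernel path */

        printf("x[0]=%f (expect %f), y[0]=%f (expect %f)\n", x[0], ex, y[0], ey);
        return (fabsf(x[0] - ex) > 1e-6f) || (fabsf(y[0] - ey) > 1e-6f);
    }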