diff --git a/Makefile.install b/Makefile.install
index 7c1a3ca43..e8b64465f 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
 OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
 OPENBLAS_BINARY_DIR := $(PREFIX)/bin
 OPENBLAS_BUILD_DIR := $(CURDIR)
-OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
+OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE)
 OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
 OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
 OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
@@ -150,13 +150,13 @@ endif
 endif
 
 #Generating openblas.pc
-	@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
-	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-	@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
+	@echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
+	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
+	@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
 
 
 #Generating OpenBLASConfig.cmake
diff --git a/Makefile.prebuild b/Makefile.prebuild
index 48fb5e991..d6395da7b 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -41,6 +41,10 @@ ifeq ($(TARGET), I6500)
 TARGET_FLAGS = -mips64r6
 endif
 
+ifeq ($(TARGET), C910V)
+TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v
+endif
+
 all: getarch_2nd
 	./getarch_2nd  0 >> $(TARGET_MAKE)
 	./getarch_2nd  1 >> $(TARGET_CONF)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
new file mode 100644
index 000000000..15d7b059c
--- /dev/null
+++ b/Makefile.riscv64
@@ -0,0 +1,4 @@
+ifeq ($(CORE), C910V)
+CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v
+FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static
+endif
diff --git a/Makefile.system b/Makefile.system
index aae7ba503..afc8ee207 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -751,7 +751,10 @@ endif
 endif
 endif
 
-
+ifeq ($(ARCH), riscv64)
+NO_BINARY_MODE  = 1
+BINARY_DEFINED  = 1
+endif
 
 
 #
@@ -833,7 +836,9 @@ endif
 ifndef BINARY_DEFINED
 ifneq ($(OSNAME), AIX)
 ifdef BINARY64
+ifneq ($(ARCH), riscv64)
 CCOMMON_OPT += -m64
+endif
 else
 CCOMMON_OPT += -m32
 endif
@@ -954,8 +959,10 @@ endif
 else
 ifdef BINARY64
 ifneq ($(OSNAME), AIX)
+ifneq ($(ARCH), riscv64)
 FCOMMON_OPT += -m64
 endif
+endif
 ifdef INTERFACE64
 ifneq ($(INTERFACE64), 0)
 FCOMMON_OPT +=  -fdefault-integer-8
@@ -1285,10 +1292,14 @@ ifndef SYMBOLSUFFIX
 SYMBOLSUFFIX =
 endif
 
+ifndef LIBSONAMEBASE
+LIBSONAMEBASE = openblas
+endif
+
 ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)
+LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
 else
-LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
 endif
 
 ifeq ($(OSNAME), CYGWIN_NT)
diff --git a/README.md b/README.md
index ca034e747..267df5358 100644
--- a/README.md
+++ b/README.md
@@ -172,6 +172,13 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **Z13**: Optimized Level-3 BLAS and Level-1,2
 - **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
 
+#### RISC-V
+
+- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
+  ```sh
+  make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
+  ```
+
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
diff --git a/TargetList.txt b/TargetList.txt
index 66eca4506..d19964916 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -104,3 +104,8 @@ VORTEX
 ZARCH_GENERIC
 Z13
 Z14
+
+10.RISC-V 64:
+RISCV64_GENERIC
+C910V
+
diff --git a/c_check b/c_check
index 5ea93b75c..405963ae6 100644
--- a/c_check
+++ b/c_check
@@ -92,6 +92,7 @@ $architecture = ia64   if ($data =~ /ARCH_IA64/);
 $architecture = arm    if ($data =~ /ARCH_ARM/);
 $architecture = arm64  if ($data =~ /ARCH_ARM64/);
 $architecture = zarch  if ($data =~ /ARCH_ZARCH/);
+$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
 
 $defined = 0;
 
@@ -136,6 +137,11 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) {
     $binary =32;
 }
 
+if ($architecture eq "riscv64") {
+    $defined = 1;
+    $binary = 64;
+}
+
 if ($compiler eq "PGI") {
     $compiler_name .= " -tp p7"    if ($binary eq "32");
     $compiler_name .= " -tp p7-64" if ($binary eq "64");
diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index b963940d6..76952152b 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -124,6 +124,9 @@ if (NOT DYNAMIC_ARCH)
 	if (HAVE_AVX)
         set (CCOMMON_OPT  "${CCOMMON_OPT} -mavx")
 	endif ()
+	if (HAVE_FMA3)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -mfma")
+	endif ()
 	if (HAVE_SSE)
 	set (CCOMMON_OPT  "${CCOMMON_OPT} -msse")
 	endif ()
diff --git a/common.h b/common.h
index a3ef99b59..2825407cb 100644
--- a/common.h
+++ b/common.h
@@ -437,6 +437,11 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_mips.h"
 #endif
 
+    
+#ifdef ARCH_RISCV64
+#include "common_riscv64.h"
+#endif
+
 #ifdef ARCH_MIPS64
 #include "common_mips64.h"
 #endif
diff --git a/common_riscv64.h b/common_riscv64.h
new file mode 100644
index 000000000..27f385dfd
--- /dev/null
+++ b/common_riscv64.h
@@ -0,0 +1,98 @@
+/*****************************************************************************
+Copyright (c) 2011-2014, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of 
+      its contributors may be used to endorse or promote products 
+      derived from this software without specific prior written 
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#ifndef COMMON_RISCV64
+#define COMMON_RISCV64
+
+#define MB  __sync_synchronize()
+#define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
+
+#define INLINE inline
+
+#ifndef ASSEMBLER
+
+
+static inline int blas_quickdivide(blasint x, blasint y){
+  return x / y;
+}
+
+#endif
+
+
+
+#define BUFFER_SIZE     ( 32 << 20)
+#define SEEK_ADDRESS
+
+#if defined(C910V)
+#include <riscv-vector.h>
+#endif
+
+#endif
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
new file mode 100644
index 000000000..0eb50e001
--- /dev/null
+++ b/cpuid_riscv64.c
@@ -0,0 +1,113 @@
+/*****************************************************************************
+Copyright (c) 2011-2014, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of 
+      its contributors may be used to endorse or promote products 
+      derived from this software without specific prior written 
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define CPU_UNKNOWN     0
+#define CPU_C910V       1
+
+static char *cpuname[] = {
+  "UNKOWN",
+  "C910V"
+};
+
+int detect(void){
+    return CPU_UNKNOWN;
+}
+
+char *get_corename(void){
+  return cpuname[detect()];
+}
+
+void get_architecture(void){
+  printf("RISCV64");
+}
+
+void get_subarchitecture(void){
+}
+
+void get_subdirname(void){
+  printf("riscv64");
+}
+
+void get_cpuconfig(void){
+  printf("#define UNKNOWN\n");
+  printf("#define L1_DATA_SIZE 65536\n");
+  printf("#define L1_DATA_LINESIZE 32\n");
+  printf("#define L2_SIZE 512488\n");
+  printf("#define L2_LINESIZE 32\n");
+  printf("#define DTB_DEFAULT_ENTRIES 64\n");
+  printf("#define DTB_SIZE 4096\n");
+  printf("#define L2_ASSOCIATIVE 4\n");
+}
+
+void get_libname(void){
+  printf("riscv64\n");
+}
diff --git a/ctest.c b/ctest.c
index cd84ab1bb..d674a8cbd 100644
--- a/ctest.c
+++ b/ctest.c
@@ -153,6 +153,11 @@ ARCH_ARM
 ARCH_ARM64
 #endif
 
+#if defined(__riscv)
+ARCH_RISCV64
+#endif
+
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
 HAVE_C11
 #endif
+
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index a8b3e9a4b..a576127aa 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -76,10 +76,28 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 #endif
 
-void goto_set_num_threads(int num_threads) {
+static void adjust_thread_buffers() {
 
   int i=0, j=0;
 
+  //adjust buffer for each thread
+  for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
+    for(j=0; j < blas_cpu_number; j++){
+      if(blas_thread_buffer[i][j] == NULL){
+        blas_thread_buffer[i][j] = blas_memory_alloc(2);
+      }
+    }
+    for(; j < MAX_CPU_NUMBER; j++){
+      if(blas_thread_buffer[i][j] != NULL){
+        blas_memory_free(blas_thread_buffer[i][j]);
+        blas_thread_buffer[i][j] = NULL;
+      }
+    }
+  }
+}
+
+void goto_set_num_threads(int num_threads) {
+
   if (num_threads < 1) num_threads = blas_num_threads;
 
   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@@ -92,20 +110,7 @@ void goto_set_num_threads(int num_threads) {
 
   omp_set_num_threads(blas_cpu_number);
 
-  //adjust buffer for each thread
-  for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
-    for(j=0; j<blas_cpu_number; j++){
-      if(blas_thread_buffer[i][j]==NULL){
-        blas_thread_buffer[i][j]=blas_memory_alloc(2);
-      }
-    }
-    for(; j<MAX_CPU_NUMBER; j++){
-      if(blas_thread_buffer[i][j]!=NULL){
-        blas_memory_free(blas_thread_buffer[i][j]);
-        blas_thread_buffer[i][j]=NULL;
-      }
-    }
-  }
+  adjust_thread_buffers();
 #if defined(ARCH_MIPS64)
   //set parameters for different number of threads.
   blas_set_parameter();
@@ -119,20 +124,11 @@ void openblas_set_num_threads(int num_threads) {
 
 int blas_thread_init(void){
 
-  int i=0, j=0;
-
   blas_get_cpu_number();
 
-  blas_server_avail = 1;
+  adjust_thread_buffers();
 
-  for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
-    for(j=0; j<blas_num_threads; j++){
-      blas_thread_buffer[i][j]=blas_memory_alloc(2);
-    }
-    for(; j<MAX_CPU_NUMBER; j++){
-      blas_thread_buffer[i][j]=NULL;
-    }
-  }
+  blas_server_avail = 1;
 
   return 0;
 }
diff --git a/getarch.c b/getarch.c
index daf669e56..8b00aaee7 100644
--- a/getarch.c
+++ b/getarch.c
@@ -981,6 +981,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
+#ifdef FORCE_RISCV64_GENERIC
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_GENERIC"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-DRISCV64_GENERIC " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "riscv64_generic"
+#define CORENAME  "RISCV64_GENERIC"
+#else
+#endif
+
 #ifdef FORCE_CORTEXA15
 #define FORCE
 #define ARCHITECTURE    "ARM"
@@ -1266,6 +1280,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "Z14"
 #endif
 
+#ifdef FORCE_C910V
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "C910V"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-DC910V " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "c910v"
+#define CORENAME  "C910V"
+#else
+#endif
+
+
 #ifndef FORCE
 
 #ifdef USER_TARGET
@@ -1320,6 +1349,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
+#ifdef __riscv
+#include "cpuid_riscv64.c"
+#endif
+
 #ifdef __arm__
 #include "cpuid_arm.c"
 #define OPENBLAS_SUPPORTED
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 2ba593c2e..893713769 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -25,6 +25,10 @@ ifeq ($(ARCH), arm64)
 USE_TRMM = 1
 endif
 
+ifeq ($(ARCH), riscv64)
+USE_TRMM = 1
+endif
+
 ifeq ($(TARGET), LOONGSON3B)
 USE_TRMM = 1
 endif
diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c
index 63584b95c..a486a1868 100644
--- a/kernel/arm/sum.c
+++ b/kernel/arm/sum.c
@@ -42,24 +42,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	n *= inc_x;
 	if (inc_x == 1)
 	{
-#if V_SIMD
+#if V_SIMD && (!defined(DOUBLE) || (defined(DOUBLE) && V_SIMD_F64 && V_SIMD > 128))
 #ifdef DOUBLE
 		const int vstep = v_nlanes_f64;
-		const int unrollx2 = n & (-vstep * 2);
+		const int unrollx4 = n & (-vstep * 4);
 		const int unrollx = n & -vstep;
 		v_f64 vsum0 = v_zero_f64();
 		v_f64 vsum1 = v_zero_f64();
-		while (i < unrollx2)
-		{
-			vsum0 = v_add_f64(vsum0, v_loadu_f64(x));
-			vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep));
-			i += vstep * 2;
-		}
-		vsum0 = v_add_f64(vsum0, vsum1);
-		while (i < unrollx)
+		v_f64 vsum2 = v_zero_f64();
+		v_f64 vsum3 = v_zero_f64();
+		for (; i < unrollx4; i += vstep * 4)
+		{
+			vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i));
+			vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep));
+			vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2));
+			vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3));
+		}
+		vsum0 = v_add_f64(
+			v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3));
+		for (; i < unrollx; i += vstep)
 		{
 			vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i));
-			i += vstep;
 		}
 		sumf = v_sum_f64(vsum0);
 #else
@@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 		v_f32 vsum1 = v_zero_f32();
 		v_f32 vsum2 = v_zero_f32();
 		v_f32 vsum3 = v_zero_f32();
-		while (i < unrollx4)
+		for (; i < unrollx4; i += vstep * 4)
 		{
-			vsum0 = v_add_f32(vsum0, v_loadu_f32(x));
-			vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep));
-			vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2));
-			vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3));
-			i += vstep * 4;
+			vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
+			vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep));
+			vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2));
+			vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3));
 		}
 		vsum0 = v_add_f32(
 			v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3));
-		while (i < unrollx)
+		for (; i < unrollx; i += vstep)
 		{
 			vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
-			i += vstep;
 		}
 		sumf = v_sum_f32(vsum0);
 #endif
diff --git a/kernel/generic/trmmkernel_16x4.c b/kernel/generic/trmmkernel_16x4.c
new file mode 100644
index 000000000..7ea4e108c
--- /dev/null
+++ b/kernel/generic/trmmkernel_16x4.c
@@ -0,0 +1,2092 @@
+#include "common.h"
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+   FLOAT res0_4;
+   FLOAT res0_5;
+   FLOAT res0_6;
+   FLOAT res0_7;
+
+   FLOAT res0_8;
+   FLOAT res0_9;
+   FLOAT res0_10;
+   FLOAT res0_11;
+   FLOAT res0_12;
+   FLOAT res0_13;
+   FLOAT res0_14;
+   FLOAT res0_15;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+   FLOAT res1_4;
+   FLOAT res1_5;
+   FLOAT res1_6;
+   FLOAT res1_7;
+
+   FLOAT res1_8;
+   FLOAT res1_9;
+   FLOAT res1_10;
+   FLOAT res1_11;
+   FLOAT res1_12;
+   FLOAT res1_13;
+   FLOAT res1_14;
+   FLOAT res1_15;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+   FLOAT res2_4;
+   FLOAT res2_5;
+   FLOAT res2_6;
+   FLOAT res2_7;
+
+   FLOAT res2_8;
+   FLOAT res2_9;
+   FLOAT res2_10;
+   FLOAT res2_11;
+   FLOAT res2_12;
+   FLOAT res2_13;
+   FLOAT res2_14;
+   FLOAT res2_15;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+   FLOAT res3_4;
+   FLOAT res3_5;
+   FLOAT res3_6;
+   FLOAT res3_7;
+
+   FLOAT res3_8;
+   FLOAT res3_9;
+   FLOAT res3_10;
+   FLOAT res3_11;
+   FLOAT res3_12;
+   FLOAT res3_13;
+   FLOAT res3_14;
+   FLOAT res3_15;
+
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+
+   BLASLONG off, temp;
+
+#if !defined(LEFT)
+   off = -offset;
+#else
+   off = 0;
+#endif
+
+   for (j=0; j<bn/4; j+=1)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C0+2*ldc;
+        C3 = C0+3*ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res2_8  = 0;
+		res2_9  = 0;
+		res2_10 = 0;
+		res2_11 = 0;
+		res2_12 = 0;
+		res2_13 = 0;
+		res2_14 = 0;
+		res2_15 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+		res3_8  = 0;
+		res3_9  = 0;
+		res3_10 = 0;
+		res3_11 = 0;
+		res3_12 = 0;
+		res3_13 = 0;
+		res3_14 = 0;
+		res3_15 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+			res2_8 += a0*b2;
+			res3_8 += a0*b3;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+			res2_9 += a1*b2;
+			res3_9 += a1*b3;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+			res2_10 += a0*b2;
+			res3_10 += a0*b3;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+			res2_11 += a1*b2;
+			res3_11 += a1*b3;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+			res2_12 += a0*b2;
+			res3_12 += a0*b3;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+			res2_13 += a1*b2;
+			res3_13 += a1*b3;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+			res2_14 += a0*b2;
+			res3_14 += a0*b3;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+			res2_15 += a1*b2;
+			res3_15 += a1*b3;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res2_8  *= alpha;
+		res2_9  *= alpha;
+		res2_10 *= alpha;
+		res2_11 *= alpha;
+		res2_12 *= alpha;
+		res2_13 *= alpha;
+		res2_14 *= alpha;
+		res2_15 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		res3_8  *= alpha;
+		res3_9  *= alpha;
+		res3_10 *= alpha;
+		res3_11 *= alpha;
+		res3_12 *= alpha;
+		res3_13 *= alpha;
+		res3_14 *= alpha;
+		res3_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C2[8]  = res2_8;
+		C2[9]  = res2_9;
+		C2[10] = res2_10;
+		C2[11] = res2_11;
+		C2[12] = res2_12;
+		C2[13] = res2_13;
+		C2[14] = res2_14;
+		C2[15] = res2_15;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+		C3[8]  = res3_8;
+		C3[9]  = res3_9;
+		C3[10] = res3_10;
+		C3[11] = res3_11;
+		C3[12] = res3_12;
+		C3[13] = res3_13;
+		C3[14] = res3_14;
+		C3[15] = res3_15;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+		C2 = C2+16;
+		C3 = C3+16;
+	}
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+4;
+
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+		C2 = C2+8;
+		C3 = C3+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+	}
+	
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+4;
+                }
+		res0_0 *= alpha;
+		res1_0 *= alpha;
+		res2_0 *= alpha;
+		res3_0 *= alpha;
+
+		C0[0] = res0_0;
+		C1[0] = res1_0;
+		C2[0] = res2_0;
+		C3[0] = res3_0;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+
+   if(bn&2)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+	}
+
+
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+
+	}
+
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+
+		res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+   for (j=0; j<(bn&1); j+=1)
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+	off = offset;
+#endif
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+	}
+
+
+
+
+        if ( bm & 8 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+
+	}
+
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+
+		C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+
+	}
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/kernel/generic/trmmkernel_8x4.c b/kernel/generic/trmmkernel_8x4.c
new file mode 100644
index 000000000..ae88e8d8f
--- /dev/null
+++ b/kernel/generic/trmmkernel_8x4.c
@@ -0,0 +1,1317 @@
+#include "common.h"
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+    FLOAT res0_0;
+    FLOAT res0_1;
+    FLOAT res0_2;
+    FLOAT res0_3;
+    FLOAT res0_4;
+    FLOAT res0_5;
+    FLOAT res0_6;
+    FLOAT res0_7;
+
+    FLOAT res1_0;
+    FLOAT res1_1;
+    FLOAT res1_2;
+    FLOAT res1_3;
+    FLOAT res1_4;
+    FLOAT res1_5;
+    FLOAT res1_6;
+    FLOAT res1_7;
+
+    FLOAT res2_0;
+    FLOAT res2_1;
+    FLOAT res2_2;
+    FLOAT res2_3;
+    FLOAT res2_4;
+    FLOAT res2_5;
+    FLOAT res2_6;
+    FLOAT res2_7;
+
+    FLOAT res3_0;
+    FLOAT res3_1;
+    FLOAT res3_2;
+    FLOAT res3_3;
+    FLOAT res3_4;
+    FLOAT res3_5;
+    FLOAT res3_6;
+    FLOAT res3_7;
+
+    FLOAT a0;
+    FLOAT a1;
+
+    FLOAT b0;
+    FLOAT b1;
+    FLOAT b2;
+    FLOAT b3;
+
+    BLASLONG off, temp;
+
+#if !defined(LEFT)
+    off = -offset;
+#else
+    off = 0;
+#endif
+
+    for (j=0; j<bn/4; j+=1)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*8;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+            res0_4 = 0;
+            res0_5 = 0;
+            res0_6 = 0;
+            res0_7 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+            res1_4 = 0;
+            res1_5 = 0;
+            res1_6 = 0;
+            res1_7 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+            res2_2 = 0;
+            res2_3 = 0;
+            res2_4 = 0;
+            res2_5 = 0;
+            res2_6 = 0;
+            res2_7 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+            res3_2 = 0;
+            res3_3 = 0;
+            res3_4 = 0;
+            res3_5 = 0;
+            res3_6 = 0;
+            res3_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+8;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+                res2_2 += a0*b2;
+                res3_2 += a0*b3;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+                res2_3 += a1*b2;
+                res3_3 += a1*b3;
+
+                a0 = ptrba[4];
+                res0_4 += a0*b0;
+                res1_4 += a0*b1;
+                res2_4 += a0*b2;
+                res3_4 += a0*b3;
+
+                a1 = ptrba[5];
+                res0_5 += a1*b0;
+                res1_5 += a1*b1;
+                res2_5 += a1*b2;
+                res3_5 += a1*b3;
+
+                a0 = ptrba[6];
+                res0_6 += a0*b0;
+                res1_6 += a0*b1;
+                res2_6 += a0*b2;
+                res3_6 += a0*b3;
+
+                a1 = ptrba[7];
+                res0_7 += a1*b0;
+                res1_7 += a1*b1;
+                res2_7 += a1*b2;
+                res3_7 += a1*b3;
+
+                ptrba = ptrba+8;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+            res0_4 *= alpha;
+            res0_5 *= alpha;
+            res0_6 *= alpha;
+            res0_7 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+            res1_4 *= alpha;
+            res1_5 *= alpha;
+            res1_6 *= alpha;
+            res1_7 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+            res2_2 *= alpha;
+            res2_3 *= alpha;
+            res2_4 *= alpha;
+            res2_5 *= alpha;
+            res2_6 *= alpha;
+            res2_7 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+            res3_2 *= alpha;
+            res3_3 *= alpha;
+            res3_4 *= alpha;
+            res3_5 *= alpha;
+            res3_6 *= alpha;
+            res3_7 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+            C0[4] = res0_4;
+            C0[5] = res0_5;
+            C0[6] = res0_6;
+            C0[7] = res0_7;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+            C1[4] = res1_4;
+            C1[5] = res1_5;
+            C1[6] = res1_6;
+            C1[7] = res1_7;
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+            C2[2] = res2_2;
+            C2[3] = res2_3;
+            C2[4] = res2_4;
+            C2[5] = res2_5;
+            C2[6] = res2_6;
+            C2[7] = res2_7;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+            C3[2] = res3_2;
+            C3[3] = res3_3;
+            C3[4] = res3_4;
+            C3[5] = res3_5;
+            C3[6] = res3_6;
+            C3[7] = res3_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*8;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+
+            C0 = C0+8;
+            C1 = C1+8;
+            C2 = C2+8;
+            C3 = C3+8;
+        }
+
+        if ( bm & 4 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+            res2_2 = 0;
+            res2_3 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+            res3_2 = 0;
+            res3_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+                res2_2 += a0*b2;
+                res3_2 += a0*b3;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+                res2_3 += a1*b2;
+                res3_3 += a1*b3;
+
+                ptrba = ptrba+4;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+            res2_2 *= alpha;
+            res2_3 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+            res3_2 *= alpha;
+            res3_3 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+            C2[2] = res2_2;
+            C2[3] = res2_3;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+            C3[2] = res3_2;
+            C3[3] = res3_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+            C2 = C2+4;
+            C3 = C3+4;
+
+        }
+
+        if ( bm & 2 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+
+                ptrba = ptrba+2;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+            C2 = C2+2;
+            C3 = C3+2;
+
+        }
+
+        if ( bm & 1 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res1_0 = 0;
+            res2_0 = 0;
+            res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                ptrba = ptrba+1;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res1_0 *= alpha;
+            res2_0 *= alpha;
+            res3_0 *= alpha;
+
+            C0[0] = res0_0;
+            C1[0] = res1_0;
+            C2[0] = res2_0;
+            C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+            C2 = C2+1;
+            C3 = C3+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+
+    for (j=0; j<(bn&2); j+=2)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*8;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+            res0_4 = 0;
+            res0_5 = 0;
+            res0_6 = 0;
+            res0_7 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+            res1_4 = 0;
+            res1_5 = 0;
+            res1_6 = 0;
+            res1_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+8;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+
+                a0 = ptrba[4];
+                res0_4 += a0*b0;
+                res1_4 += a0*b1;
+
+                a1 = ptrba[5];
+                res0_5 += a1*b0;
+                res1_5 += a1*b1;
+
+                a0 = ptrba[6];
+                res0_6 += a0*b0;
+                res1_6 += a0*b1;
+
+                a1 = ptrba[7];
+                res0_7 += a1*b0;
+                res1_7 += a1*b1;
+
+                ptrba = ptrba+8;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+            res0_4 *= alpha;
+            res0_5 *= alpha;
+            res0_6 *= alpha;
+            res0_7 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+            res1_4 *= alpha;
+            res1_5 *= alpha;
+            res1_6 *= alpha;
+            res1_7 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+            C0[4] = res0_4;
+            C0[5] = res0_5;
+            C0[6] = res0_6;
+            C0[7] = res0_7;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+            C1[4] = res1_4;
+            C1[5] = res1_5;
+            C1[6] = res1_6;
+            C1[7] = res1_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*8;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+
+            C0 = C0+8;
+            C1 = C1+8;
+        }
+
+        if ( bm & 4 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+
+                ptrba = ptrba+4;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+
+        }
+
+        if ( bm & 2 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+
+                ptrba = ptrba+2;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+
+        }
+
+        if ( bm & 1 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+
+            res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                ptrba = ptrba+1;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+
+            res1_0 *= alpha;
+
+            C0[0] = res0_0;
+
+            C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+
+
+
+
+
+    for (j=0; j<(bn&1); j+=1)
+    {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*8;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+            res0_4 = 0;
+            res0_5 = 0;
+            res0_6 = 0;
+            res0_7 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+8;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+
+                a0 = ptrba[4];
+                res0_4 += a0*b0;
+
+                a1 = ptrba[5];
+                res0_5 += a1*b0;
+
+                a0 = ptrba[6];
+                res0_6 += a0*b0;
+
+                a1 = ptrba[7];
+                res0_7 += a1*b0;
+
+                ptrba = ptrba+8;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+            res0_4 *= alpha;
+            res0_5 *= alpha;
+            res0_6 *= alpha;
+            res0_7 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+            C0[4] = res0_4;
+            C0[5] = res0_5;
+            C0[6] = res0_6;
+            C0[7] = res0_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*8;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+
+            C0 = C0+8;
+        }
+
+        if ( bm & 4 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+
+                ptrba = ptrba+4;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+
+        }
+
+        if ( bm & 2 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+
+                ptrba = ptrba+2;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+
+        }
+
+        if ( bm & 1 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                ptrba = ptrba+1;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+
+            C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+
+        }
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+    }
+    return 0;
+}
diff --git a/kernel/generic/trmmkernel_8x8.c b/kernel/generic/trmmkernel_8x8.c
new file mode 100644
index 000000000..9add8b9d7
--- /dev/null
+++ b/kernel/generic/trmmkernel_8x8.c
@@ -0,0 +1,2207 @@
+#include "common.h"
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+
+    FLOAT res0_0;
+    FLOAT res0_1;
+    FLOAT res0_2;
+    FLOAT res0_3;
+    FLOAT res0_4;
+    FLOAT res0_5;
+    FLOAT res0_6;
+    FLOAT res0_7;
+
+    FLOAT res1_0;
+    FLOAT res1_1;
+    FLOAT res1_2;
+    FLOAT res1_3;
+    FLOAT res1_4;
+    FLOAT res1_5;
+    FLOAT res1_6;
+    FLOAT res1_7;
+
+    FLOAT res2_0;
+    FLOAT res2_1;
+    FLOAT res2_2;
+    FLOAT res2_3;
+    FLOAT res2_4;
+    FLOAT res2_5;
+    FLOAT res2_6;
+    FLOAT res2_7;
+
+    FLOAT res3_0;
+    FLOAT res3_1;
+    FLOAT res3_2;
+    FLOAT res3_3;
+    FLOAT res3_4;
+    FLOAT res3_5;
+    FLOAT res3_6;
+    FLOAT res3_7;
+
+    FLOAT res4_0;
+    FLOAT res4_1;
+    FLOAT res4_2;
+    FLOAT res4_3;
+    FLOAT res4_4;
+    FLOAT res4_5;
+    FLOAT res4_6;
+    FLOAT res4_7;
+
+    FLOAT res5_0;
+    FLOAT res5_1;
+    FLOAT res5_2;
+    FLOAT res5_3;
+    FLOAT res5_4;
+    FLOAT res5_5;
+    FLOAT res5_6;
+    FLOAT res5_7;
+
+    FLOAT res6_0;
+    FLOAT res6_1;
+    FLOAT res6_2;
+    FLOAT res6_3;
+    FLOAT res6_4;
+    FLOAT res6_5;
+    FLOAT res6_6;
+    FLOAT res6_7;
+
+    FLOAT res7_0;
+    FLOAT res7_1;
+    FLOAT res7_2;
+    FLOAT res7_3;
+    FLOAT res7_4;
+    FLOAT res7_5;
+    FLOAT res7_6;
+    FLOAT res7_7;
+
+    FLOAT a0;
+    FLOAT a1;
+
+    FLOAT b0;
+    FLOAT b1;
+    FLOAT b2;
+    FLOAT b3;
+    
+    FLOAT b4;
+    FLOAT b5;
+    FLOAT b6;
+    FLOAT b7;
+
+    BLASLONG off, temp;
+
+#if !defined(LEFT)
+    off = -offset;
+#else
+    off = 0;
+#endif
+    for (j=0; j<bn/8; j+=1)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+	
+        C4 = C3+ldc;
+        C5 = C4+ldc;
+        C6 = C5+ldc;
+        C7 = C6+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*8;
+            ptrbb = bb + off*8;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+            res0_4 = 0;
+            res0_5 = 0;
+            res0_6 = 0;
+            res0_7 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+            res1_4 = 0;
+            res1_5 = 0;
+            res1_6 = 0;
+            res1_7 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+            res2_2 = 0;
+            res2_3 = 0;
+            res2_4 = 0;
+            res2_5 = 0;
+            res2_6 = 0;
+            res2_7 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+            res3_2 = 0;
+            res3_3 = 0;
+            res3_4 = 0;
+            res3_5 = 0;
+            res3_6 = 0;
+            res3_7 = 0;
+
+            res4_0 = 0;
+            res4_1 = 0;
+            res4_2 = 0;
+            res4_3 = 0;
+            res4_4 = 0;
+            res4_5 = 0;
+            res4_6 = 0;
+            res4_7 = 0;
+
+            res5_0 = 0;
+            res5_1 = 0;
+            res5_2 = 0;
+            res5_3 = 0;
+            res5_4 = 0;
+            res5_5 = 0;
+            res5_6 = 0;
+            res5_7 = 0;
+
+            res6_0 = 0;
+            res6_1 = 0;
+            res6_2 = 0;
+            res6_3 = 0;
+            res6_4 = 0;
+            res6_5 = 0;
+            res6_6 = 0;
+            res6_7 = 0;
+
+            res7_0 = 0;
+            res7_1 = 0;
+            res7_2 = 0;
+            res7_3 = 0;
+            res7_4 = 0;
+            res7_5 = 0;
+            res7_6 = 0;
+            res7_7 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+8;	// number of values in A
+#else
+            temp = off+8;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+                b4 = ptrbb[4];
+                b5 = ptrbb[5];
+                b6 = ptrbb[6];
+                b7 = ptrbb[7];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+                res4_0 += a0*b4;
+                res5_0 += a0*b5;
+                res6_0 += a0*b6;
+                res7_0 += a0*b7;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+                res4_1 += a1*b4;
+                res5_1 += a1*b5;
+                res6_1 += a1*b6;
+                res7_1 += a1*b7;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+                res2_2 += a0*b2;
+                res3_2 += a0*b3;
+                res4_2 += a0*b4;
+                res5_2 += a0*b5;
+                res6_2 += a0*b6;
+                res7_2 += a0*b7;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+                res2_3 += a1*b2;
+                res3_3 += a1*b3;
+                res4_3 += a1*b4;
+                res5_3 += a1*b5;
+                res6_3 += a1*b6;
+                res7_3 += a1*b7;
+
+                a0 = ptrba[4];
+                res0_4 += a0*b0;
+                res1_4 += a0*b1;
+                res2_4 += a0*b2;
+                res3_4 += a0*b3;
+                res4_4 += a0*b4;
+                res5_4 += a0*b5;
+                res6_4 += a0*b6;
+                res7_4 += a0*b7;
+
+                a1 = ptrba[5];
+                res0_5 += a1*b0;
+                res1_5 += a1*b1;
+                res2_5 += a1*b2;
+                res3_5 += a1*b3;
+                res4_5 += a1*b4;
+                res5_5 += a1*b5;
+                res6_5 += a1*b6;
+                res7_5 += a1*b7;
+
+                a0 = ptrba[6];
+                res0_6 += a0*b0;
+                res1_6 += a0*b1;
+                res2_6 += a0*b2;
+                res3_6 += a0*b3;
+                res4_6 += a0*b4;
+                res5_6 += a0*b5;
+                res6_6 += a0*b6;
+                res7_6 += a0*b7;
+
+                a1 = ptrba[7];
+                res0_7 += a1*b0;
+                res1_7 += a1*b1;
+                res2_7 += a1*b2;
+                res3_7 += a1*b3;
+                res4_7 += a1*b4;
+                res5_7 += a1*b5;
+                res6_7 += a1*b6;
+                res7_7 += a1*b7;
+
+                ptrba = ptrba+8;
+                ptrbb = ptrbb+8;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+            res0_4 *= alpha;
+            res0_5 *= alpha;
+            res0_6 *= alpha;
+            res0_7 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+            res1_4 *= alpha;
+            res1_5 *= alpha;
+            res1_6 *= alpha;
+            res1_7 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+            res2_2 *= alpha;
+            res2_3 *= alpha;
+            res2_4 *= alpha;
+            res2_5 *= alpha;
+            res2_6 *= alpha;
+            res2_7 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+            res3_2 *= alpha;
+            res3_3 *= alpha;
+            res3_4 *= alpha;
+            res3_5 *= alpha;
+            res3_6 *= alpha;
+            res3_7 *= alpha;
+	    
+            res4_0 *= alpha;
+            res4_1 *= alpha;
+            res4_2 *= alpha;
+            res4_3 *= alpha;
+            res4_4 *= alpha;
+            res4_5 *= alpha;
+            res4_6 *= alpha;
+            res4_7 *= alpha;
+
+            res5_0 *= alpha;
+            res5_1 *= alpha;
+            res5_2 *= alpha;
+            res5_3 *= alpha;
+            res5_4 *= alpha;
+            res5_5 *= alpha;
+            res5_6 *= alpha;
+            res5_7 *= alpha;
+
+            res6_0 *= alpha;
+            res6_1 *= alpha;
+            res6_2 *= alpha;
+            res6_3 *= alpha;
+            res6_4 *= alpha;
+            res6_5 *= alpha;
+            res6_6 *= alpha;
+            res6_7 *= alpha;
+
+            res7_0 *= alpha;
+            res7_1 *= alpha;
+            res7_2 *= alpha;
+            res7_3 *= alpha;
+            res7_4 *= alpha;
+            res7_5 *= alpha;
+            res7_6 *= alpha;
+            res7_7 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+            C0[4] = res0_4;
+            C0[5] = res0_5;
+            C0[6] = res0_6;
+            C0[7] = res0_7;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+            C1[4] = res1_4;
+            C1[5] = res1_5;
+            C1[6] = res1_6;
+            C1[7] = res1_7;
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+            C2[2] = res2_2;
+            C2[3] = res2_3;
+            C2[4] = res2_4;
+            C2[5] = res2_5;
+            C2[6] = res2_6;
+            C2[7] = res2_7;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+            C3[2] = res3_2;
+            C3[3] = res3_3;
+            C3[4] = res3_4;
+            C3[5] = res3_5;
+            C3[6] = res3_6;
+            C3[7] = res3_7;
+
+            C4[0] = res4_0;
+            C4[1] = res4_1;
+            C4[2] = res4_2;
+            C4[3] = res4_3;
+            C4[4] = res4_4;
+            C4[5] = res4_5;
+            C4[6] = res4_6;
+            C4[7] = res4_7;
+
+            C5[0] = res5_0;
+            C5[1] = res5_1;
+            C5[2] = res5_2;
+            C5[3] = res5_3;
+            C5[4] = res5_4;
+            C5[5] = res5_5;
+            C5[6] = res5_6;
+            C5[7] = res5_7;
+
+            C6[0] = res6_0;
+            C6[1] = res6_1;
+            C6[2] = res6_2;
+            C6[3] = res6_3;
+            C6[4] = res6_4;
+            C6[5] = res6_5;
+            C6[6] = res6_6;
+            C6[7] = res6_7;
+
+            C7[0] = res7_0;
+            C7[1] = res7_1;
+            C7[2] = res7_2;
+            C7[3] = res7_3;
+            C7[4] = res7_4;
+            C7[5] = res7_5;
+            C7[6] = res7_6;
+            C7[7] = res7_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            ptrba += temp*8;
+            ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+
+            C0 = C0+8;
+            C1 = C1+8;
+            C2 = C2+8;
+            C3 = C3+8;
+            C4 = C4+8;
+            C5 = C5+8;
+            C6 = C6+8;
+            C7 = C7+8;
+        }
+
+        if ( bm & 4 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*8;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+            res2_2 = 0;
+            res2_3 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+            res3_2 = 0;
+            res3_3 = 0;
+
+            res4_0 = 0;
+            res4_1 = 0;
+            res4_2 = 0;
+            res4_3 = 0;
+
+            res5_0 = 0;
+            res5_1 = 0;
+            res5_2 = 0;
+            res5_3 = 0;
+
+            res6_0 = 0;
+            res6_1 = 0;
+            res6_2 = 0;
+            res6_3 = 0;
+
+            res7_0 = 0;
+            res7_1 = 0;
+            res7_2 = 0;
+            res7_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;	// number of values in A
+#else
+            temp = off+8;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+                b4 = ptrbb[4];
+                b5 = ptrbb[5];
+                b6 = ptrbb[6];
+                b7 = ptrbb[7];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+                res4_0 += a0*b4;
+                res5_0 += a0*b5;
+                res6_0 += a0*b6;
+                res7_0 += a0*b7;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+                res4_1 += a1*b4;
+                res5_1 += a1*b5;
+                res6_1 += a1*b6;
+                res7_1 += a1*b7;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+                res2_2 += a0*b2;
+                res3_2 += a0*b3;
+                res4_2 += a0*b4;
+                res5_2 += a0*b5;
+                res6_2 += a0*b6;
+                res7_2 += a0*b7;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+                res2_3 += a1*b2;
+                res3_3 += a1*b3;
+                res4_3 += a1*b4;
+                res5_3 += a1*b5;
+                res6_3 += a1*b6;
+                res7_3 += a1*b7;
+
+                ptrba = ptrba+4;
+                ptrbb = ptrbb+8;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+            res2_2 *= alpha;
+            res2_3 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+            res3_2 *= alpha;
+            res3_3 *= alpha;
+
+            res4_0 *= alpha;
+            res4_1 *= alpha;
+            res4_2 *= alpha;
+            res4_3 *= alpha;
+
+            res5_0 *= alpha;
+            res5_1 *= alpha;
+            res5_2 *= alpha;
+            res5_3 *= alpha;
+
+            res6_0 *= alpha;
+            res6_1 *= alpha;
+            res6_2 *= alpha;
+            res6_3 *= alpha;
+
+            res7_0 *= alpha;
+            res7_1 *= alpha;
+            res7_2 *= alpha;
+            res7_3 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+            C2[2] = res2_2;
+            C2[3] = res2_3;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+            C3[2] = res3_2;
+            C3[3] = res3_3;
+
+            C4[0] = res4_0;
+            C4[1] = res4_1;
+            C4[2] = res4_2;
+            C4[3] = res4_3;
+
+            C5[0] = res5_0;
+            C5[1] = res5_1;
+            C5[2] = res5_2;
+            C5[3] = res5_3;
+
+            C6[0] = res6_0;
+            C6[1] = res6_1;
+            C6[2] = res6_2;
+            C6[3] = res6_3;
+
+            C7[0] = res7_0;
+            C7[1] = res7_1;
+            C7[2] = res7_2;
+            C7[3] = res7_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+            C2 = C2+4;
+            C3 = C3+4;
+            C4 = C4+4;
+            C5 = C5+4;
+            C6 = C6+4;
+            C7 = C7+4;
+
+        }
+
+        if ( bm & 2 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*8;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+
+            res4_0 = 0;
+            res4_1 = 0;
+
+            res5_0 = 0;
+            res5_1 = 0;
+
+            res6_0 = 0;
+            res6_1 = 0;
+
+            res7_0 = 0;
+            res7_1 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;	// number of values in A
+#else
+            temp = off+8;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+                b4 = ptrbb[4];
+                b5 = ptrbb[5];
+                b6 = ptrbb[6];
+                b7 = ptrbb[7];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+		res4_0 += a0*b4;
+                res5_0 += a0*b5;
+                res6_0 += a0*b6;
+                res7_0 += a0*b7;
+		
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+                res4_1 += a1*b4;
+                res5_1 += a1*b5;
+                res6_1 += a1*b6;
+                res7_1 += a1*b7;
+
+                ptrba = ptrba+2;
+                ptrbb = ptrbb+8;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+	    
+            res4_0 *= alpha;
+            res4_1 *= alpha;
+
+            res5_0 *= alpha;
+            res5_1 *= alpha;
+
+            res6_0 *= alpha;
+            res6_1 *= alpha;
+
+            res7_0 *= alpha;
+            res7_1 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+
+            C4[0] = res4_0;
+            C4[1] = res4_1;
+
+            C5[0] = res5_0;
+            C5[1] = res5_1;
+
+            C6[0] = res6_0;
+            C6[1] = res6_1;
+
+            C7[0] = res7_0;
+            C7[1] = res7_1;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+            C2 = C2+2;
+            C3 = C3+2;
+            C4 = C4+2;
+            C5 = C5+2;
+            C6 = C6+2;
+            C7 = C7+2;
+
+        }
+
+        if ( bm & 1 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*8;
+#endif
+
+            res0_0 = 0;
+            res1_0 = 0;
+            res2_0 = 0;
+            res3_0 = 0;
+            res4_0 = 0;
+            res5_0 = 0;
+            res6_0 = 0;
+            res7_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;	// number of values in A
+#else
+            temp = off+8;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+                b4 = ptrbb[4];
+                b5 = ptrbb[5];
+                b6 = ptrbb[6];
+                b7 = ptrbb[7];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+                res4_0 += a0*b4;
+                res5_0 += a0*b5;
+                res6_0 += a0*b6;
+                res7_0 += a0*b7;
+
+                ptrba = ptrba+1;
+                ptrbb = ptrbb+8;
+            }
+
+            res0_0 *= alpha;
+            res1_0 *= alpha;
+            res2_0 *= alpha;
+            res3_0 *= alpha;
+	    res4_0 *= alpha;
+            res5_0 *= alpha;
+            res6_0 *= alpha;
+            res7_0 *= alpha;
+
+            C0[0] = res0_0;
+            C1[0] = res1_0;
+            C2[0] = res2_0;
+            C3[0] = res3_0;
+            C4[0] = res4_0;
+            C5[0] = res5_0;
+            C6[0] = res6_0;
+            C7[0] = res7_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+            C2 = C2+1;
+            C3 = C3+1;
+            C4 = C4+1;
+            C5 = C5+1;
+            C6 = C6+1;
+            C7 = C7+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 8;
+#endif
+
+        k = (bk<<3);
+        bb = bb+k;
+        i = (ldc<<3);
+        C = C+i;
+    }
+
+    if (bn&4)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*8;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+            res0_4 = 0;
+            res0_5 = 0;
+            res0_6 = 0;
+            res0_7 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+            res1_4 = 0;
+            res1_5 = 0;
+            res1_6 = 0;
+            res1_7 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+            res2_2 = 0;
+            res2_3 = 0;
+            res2_4 = 0;
+            res2_5 = 0;
+            res2_6 = 0;
+            res2_7 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+            res3_2 = 0;
+            res3_3 = 0;
+            res3_4 = 0;
+            res3_5 = 0;
+            res3_6 = 0;
+            res3_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+8;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+                res2_2 += a0*b2;
+                res3_2 += a0*b3;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+                res2_3 += a1*b2;
+                res3_3 += a1*b3;
+
+                a0 = ptrba[4];
+                res0_4 += a0*b0;
+                res1_4 += a0*b1;
+                res2_4 += a0*b2;
+                res3_4 += a0*b3;
+
+                a1 = ptrba[5];
+                res0_5 += a1*b0;
+                res1_5 += a1*b1;
+                res2_5 += a1*b2;
+                res3_5 += a1*b3;
+
+                a0 = ptrba[6];
+                res0_6 += a0*b0;
+                res1_6 += a0*b1;
+                res2_6 += a0*b2;
+                res3_6 += a0*b3;
+
+                a1 = ptrba[7];
+                res0_7 += a1*b0;
+                res1_7 += a1*b1;
+                res2_7 += a1*b2;
+                res3_7 += a1*b3;
+
+                ptrba = ptrba+8;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+            res0_4 *= alpha;
+            res0_5 *= alpha;
+            res0_6 *= alpha;
+            res0_7 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+            res1_4 *= alpha;
+            res1_5 *= alpha;
+            res1_6 *= alpha;
+            res1_7 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+            res2_2 *= alpha;
+            res2_3 *= alpha;
+            res2_4 *= alpha;
+            res2_5 *= alpha;
+            res2_6 *= alpha;
+            res2_7 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+            res3_2 *= alpha;
+            res3_3 *= alpha;
+            res3_4 *= alpha;
+            res3_5 *= alpha;
+            res3_6 *= alpha;
+            res3_7 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+            C0[4] = res0_4;
+            C0[5] = res0_5;
+            C0[6] = res0_6;
+            C0[7] = res0_7;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+            C1[4] = res1_4;
+            C1[5] = res1_5;
+            C1[6] = res1_6;
+            C1[7] = res1_7;
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+            C2[2] = res2_2;
+            C2[3] = res2_3;
+            C2[4] = res2_4;
+            C2[5] = res2_5;
+            C2[6] = res2_6;
+            C2[7] = res2_7;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+            C3[2] = res3_2;
+            C3[3] = res3_3;
+            C3[4] = res3_4;
+            C3[5] = res3_5;
+            C3[6] = res3_6;
+            C3[7] = res3_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*8;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+
+            C0 = C0+8;
+            C1 = C1+8;
+            C2 = C2+8;
+            C3 = C3+8;
+        }
+
+        if ( bm & 4 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+            res2_2 = 0;
+            res2_3 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+            res3_2 = 0;
+            res3_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+                res2_2 += a0*b2;
+                res3_2 += a0*b3;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+                res2_3 += a1*b2;
+                res3_3 += a1*b3;
+
+                ptrba = ptrba+4;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+            res2_2 *= alpha;
+            res2_3 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+            res3_2 *= alpha;
+            res3_3 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+            C2[2] = res2_2;
+            C2[3] = res2_3;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+            C3[2] = res3_2;
+            C3[3] = res3_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+            C2 = C2+4;
+            C3 = C3+4;
+
+        }
+
+        if ( bm & 2 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+
+            res2_0 = 0;
+            res2_1 = 0;
+
+            res3_0 = 0;
+            res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+                res2_1 += a1*b2;
+                res3_1 += a1*b3;
+
+                ptrba = ptrba+2;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+
+            res2_0 *= alpha;
+            res2_1 *= alpha;
+
+            res3_0 *= alpha;
+            res3_1 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+
+            C2[0] = res2_0;
+            C2[1] = res2_1;
+
+            C3[0] = res3_0;
+            C3[1] = res3_1;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+            C2 = C2+2;
+            C3 = C3+2;
+
+        }
+
+        if ( bm & 1 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*4;
+#endif
+
+            res0_0 = 0;
+            res1_0 = 0;
+            res2_0 = 0;
+            res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;	// number of values in A
+#else
+            temp = off+4;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+                b2 = ptrbb[2];
+                b3 = ptrbb[3];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+                res2_0 += a0*b2;
+                res3_0 += a0*b3;
+
+                ptrba = ptrba+1;
+                ptrbb = ptrbb+4;
+            }
+
+            res0_0 *= alpha;
+            res1_0 *= alpha;
+            res2_0 *= alpha;
+            res3_0 *= alpha;
+
+            C0[0] = res0_0;
+            C1[0] = res1_0;
+            C2[0] = res2_0;
+            C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+            C2 = C2+1;
+            C3 = C3+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+
+    for (j=0; j<(bn&2); j+=2)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*8;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+            res0_4 = 0;
+            res0_5 = 0;
+            res0_6 = 0;
+            res0_7 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+            res1_4 = 0;
+            res1_5 = 0;
+            res1_6 = 0;
+            res1_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+8;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+
+                a0 = ptrba[4];
+                res0_4 += a0*b0;
+                res1_4 += a0*b1;
+
+                a1 = ptrba[5];
+                res0_5 += a1*b0;
+                res1_5 += a1*b1;
+
+                a0 = ptrba[6];
+                res0_6 += a0*b0;
+                res1_6 += a0*b1;
+
+                a1 = ptrba[7];
+                res0_7 += a1*b0;
+                res1_7 += a1*b1;
+
+                ptrba = ptrba+8;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+            res0_4 *= alpha;
+            res0_5 *= alpha;
+            res0_6 *= alpha;
+            res0_7 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+            res1_4 *= alpha;
+            res1_5 *= alpha;
+            res1_6 *= alpha;
+            res1_7 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+            C0[4] = res0_4;
+            C0[5] = res0_5;
+            C0[6] = res0_6;
+            C0[7] = res0_7;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+            C1[4] = res1_4;
+            C1[5] = res1_5;
+            C1[6] = res1_6;
+            C1[7] = res1_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*8;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+
+            C0 = C0+8;
+            C1 = C1+8;
+        }
+
+        if ( bm & 4 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+            res1_2 = 0;
+            res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+                res1_2 += a0*b1;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+                res1_3 += a1*b1;
+
+                ptrba = ptrba+4;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+            res1_2 *= alpha;
+            res1_3 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+            C1[2] = res1_2;
+            C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+
+        }
+
+        if ( bm & 2 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+
+            res1_0 = 0;
+            res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+                res1_1 += a1*b1;
+
+                ptrba = ptrba+2;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+
+            res1_0 *= alpha;
+            res1_1 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+
+            C1[0] = res1_0;
+            C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+
+        }
+
+        if ( bm & 1 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*2;
+#endif
+
+            res0_0 = 0;
+
+            res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;	// number of values in A
+#else
+            temp = off+2;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+                b1 = ptrbb[1];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+                res1_0 += a0*b1;
+
+                ptrba = ptrba+1;
+                ptrbb = ptrbb+2;
+            }
+
+            res0_0 *= alpha;
+
+            res1_0 *= alpha;
+
+            C0[0] = res0_0;
+
+            C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+    for (j=0; j<(bn&1); j+=1)
+    {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/8; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*8;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+            res0_4 = 0;
+            res0_5 = 0;
+            res0_6 = 0;
+            res0_7 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+8;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+
+                a0 = ptrba[4];
+                res0_4 += a0*b0;
+
+                a1 = ptrba[5];
+                res0_5 += a1*b0;
+
+                a0 = ptrba[6];
+                res0_6 += a0*b0;
+
+                a1 = ptrba[7];
+                res0_7 += a1*b0;
+
+                ptrba = ptrba+8;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+            res0_4 *= alpha;
+            res0_5 *= alpha;
+            res0_6 *= alpha;
+            res0_7 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+            C0[4] = res0_4;
+            C0[5] = res0_5;
+            C0[6] = res0_6;
+            C0[7] = res0_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*8;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+
+            C0 = C0+8;
+        }
+
+        if ( bm & 4 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+            res0_2 = 0;
+            res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+
+                a0 = ptrba[2];
+                res0_2 += a0*b0;
+
+                a1 = ptrba[3];
+                res0_3 += a1*b0;
+
+                ptrba = ptrba+4;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+            res0_2 *= alpha;
+            res0_3 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+            C0[2] = res0_2;
+            C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+
+        }
+
+        if ( bm & 2 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+            res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                a1 = ptrba[1];
+                res0_1 += a1*b0;
+
+                ptrba = ptrba+2;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+            res0_1 *= alpha;
+
+            C0[0] = res0_0;
+            C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+
+        }
+
+        if ( bm & 1 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*1;
+#endif
+
+            res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;	// number of values in A
+#else
+            temp = off+1;	// number of values in B
+#endif
+
+            for (k=0; k<temp; k++)
+            {
+                b0 = ptrbb[0];
+
+                a0 = ptrba[0];
+                res0_0 += a0*b0;
+
+                ptrba = ptrba+1;
+                ptrbb = ptrbb+1;
+            }
+
+            res0_0 *= alpha;
+
+            C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+
+        }
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL
new file mode 100644
index 000000000..68d68b5f8
--- /dev/null
+++ b/kernel/riscv64/KERNEL
@@ -0,0 +1,30 @@
+ifndef SCABS_KERNEL
+SCABS_KERNEL	= ../generic/cabs.c
+endif
+
+ifndef DCABS_KERNEL
+DCABS_KERNEL	= ../generic/cabs.c
+endif
+
+ifndef QCABS_KERNEL
+QCABS_KERNEL	= ../generic/cabs.c
+endif
+
+ifndef LSAME_KERNEL
+LSAME_KERNEL	= ../generic/lsame.c
+endif
+
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
+
+
diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V
new file mode 100644
index 000000000..0da66fa35
--- /dev/null
+++ b/kernel/riscv64/KERNEL.C910V
@@ -0,0 +1,190 @@
+SAMAXKERNEL  = amax_vector.c
+DAMAXKERNEL  = amax_vector.c
+CAMAXKERNEL  = zamax_vector.c
+ZAMAXKERNEL  = zamax_vector.c
+
+SAMINKERNEL  = amin_vector.c
+DAMINKERNEL  = amin_vector.c
+CAMINKERNEL  = zamin_vector.c
+ZAMINKERNEL  = zamin_vector.c
+
+SMAXKERNEL   = max_vector.c
+DMAXKERNEL   = max_vector.c
+
+SMINKERNEL   = min_vector.c
+DMINKERNEL   = min_vector.c
+
+ISAMAXKERNEL = iamax_vector.c
+IDAMAXKERNEL = iamax_vector.c
+ICAMAXKERNEL = izamax_vector.c
+IZAMAXKERNEL = izamax_vector.c
+
+ISAMINKERNEL = iamin_vector.c
+IDAMINKERNEL = iamin_vector.c
+ICAMINKERNEL = izamin_vector.c
+IZAMINKERNEL = izamin_vector.c
+
+ISMAXKERNEL  = imax_vector.c
+IDMAXKERNEL  = imax_vector.c
+
+ISMINKERNEL  = imin_vector.c
+IDMINKERNEL  = imin_vector.c
+
+SASUMKERNEL  = asum_vector.c
+DASUMKERNEL  = asum_vector.c
+CASUMKERNEL  = zasum_vector.c
+ZASUMKERNEL  = zasum_vector.c
+
+SSUMKERNEL  = ../arm/sum.c
+DSUMKERNEL  = ../arm/sum.c
+CSUMKERNEL  = ../arm/zsum.c
+ZSUMKERNEL  = ../arm/zsum.c
+
+SAXPYKERNEL  = axpy_vector.c
+DAXPYKERNEL  = axpy_vector.c
+CAXPYKERNEL  = zaxpy_vector.c
+ZAXPYKERNEL  = zaxpy_vector.c
+
+SAXPBYKERNEL  = axpby_vector.c
+DAXPBYKERNEL  = axpby_vector.c
+CAXPBYKERNEL  = zaxpby_vector.c
+ZAXPBYKERNEL  = zaxpby_vector.c
+
+SCOPYKERNEL  = copy_vector.c
+DCOPYKERNEL  = copy_vector.c
+CCOPYKERNEL  = zcopy_vector.c
+ZCOPYKERNEL  = zcopy_vector.c
+
+SDOTKERNEL   = dot_vector.c
+DDOTKERNEL   = dot_vector.c
+CDOTKERNEL   = zdot_vector.c
+ZDOTKERNEL   = zdot_vector.c
+
+SNRM2KERNEL  = nrm2_vector.c
+DNRM2KERNEL  = nrm2_vector.c
+CNRM2KERNEL  = znrm2_vector.c
+ZNRM2KERNEL  = znrm2_vector.c
+
+SROTKERNEL   = rot_vector.c
+DROTKERNEL   = rot_vector.c
+CROTKERNEL   = zrot_vector.c
+ZROTKERNEL   = zrot_vector.c
+
+SSCALKERNEL  = scal_vector.c
+DSCALKERNEL  = scal_vector.c
+CSCALKERNEL  = zscal_vector.c
+ZSCALKERNEL  = zscal_vector.c
+
+SSWAPKERNEL  = swap_vector.c
+DSWAPKERNEL  = swap_vector.c
+CSWAPKERNEL  = zswap_vector.c
+ZSWAPKERNEL  = zswap_vector.c
+
+SGEMVNKERNEL = gemv_n_vector.c
+DGEMVNKERNEL = gemv_n_vector.c
+CGEMVNKERNEL = zgemv_n_vector.c
+ZGEMVNKERNEL = zgemv_n_vector.c
+
+SGEMVTKERNEL = gemv_t_vector.c
+DGEMVTKERNEL = gemv_t_vector.c
+CGEMVTKERNEL = zgemv_t_vector.c
+ZGEMVTKERNEL = zgemv_t_vector.c
+
+STRMMKERNEL	= ../generic/trmmkernel_16x4.c
+DTRMMKERNEL	= ../generic/trmmkernel_8x4.c
+CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+
+SGEMMKERNEL    =  sgemm_kernel_16x4_c910v.c
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DGEMMKERNEL    =  dgemm_kernel_8x4_c910v.c
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ =  cgemm_oncopy.o
+CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+
+ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ =  zgemm_oncopy.o
+ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+SSYMV_U_KERNEL =  symv_U_vector.c
+SSYMV_L_KERNEL =  symv_L_vector.c
+DSYMV_U_KERNEL =  symv_U_vector.c
+DSYMV_L_KERNEL =  symv_L_vector.c
+CSYMV_U_KERNEL =  ../generic/zsymv_k.c
+CSYMV_L_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+
+CHEMV_L_KERNEL =  zhemv_LM_vector.c
+CHEMV_M_KERNEL =  zhemv_LM_vector.c
+CHEMV_U_KERNEL =  zhemv_UV_vector.c
+CHEMV_V_KERNEL =  zhemv_UV_vector.c
+ZHEMV_L_KERNEL =  zhemv_LM_vector.c
+ZHEMV_M_KERNEL =  zhemv_LM_vector.c
+ZHEMV_U_KERNEL =  zhemv_UV_vector.c
+ZHEMV_V_KERNEL =  zhemv_UV_vector.c
+
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC
new file mode 100644
index 000000000..ea6a8cf21
--- /dev/null
+++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC
@@ -0,0 +1,164 @@
+SAMAXKERNEL  = ../riscv64/amax.c
+DAMAXKERNEL  = ../riscv64/amax.c
+CAMAXKERNEL  = ../riscv64/zamax.c
+ZAMAXKERNEL  = ../riscv64/zamax.c
+
+SAMINKERNEL  = ../riscv64/amin.c
+DAMINKERNEL  = ../riscv64/amin.c
+CAMINKERNEL  = ../riscv64/zamin.c
+ZAMINKERNEL  = ../riscv64/zamin.c
+
+SMAXKERNEL   = ../riscv64/max.c
+DMAXKERNEL   = ../riscv64/max.c
+
+SMINKERNEL   = ../riscv64/min.c
+DMINKERNEL   = ../riscv64/min.c
+
+ISAMAXKERNEL = ../riscv64/iamax.c
+IDAMAXKERNEL = ../riscv64/iamax.c
+ICAMAXKERNEL = ../riscv64/izamax.c
+IZAMAXKERNEL = ../riscv64/izamax.c
+
+ISAMINKERNEL = ../riscv64/iamin.c
+IDAMINKERNEL = ../riscv64/iamin.c
+ICAMINKERNEL = ../riscv64/izamin.c
+IZAMINKERNEL = ../riscv64/izamin.c
+
+ISMAXKERNEL  = ../riscv64/imax.c
+IDMAXKERNEL  = ../riscv64/imax.c
+
+ISMINKERNEL  = ../riscv64/imin.c
+IDMINKERNEL  = ../riscv64/imin.c
+
+SASUMKERNEL  = ../riscv64/asum.c
+DASUMKERNEL  = ../riscv64/asum.c
+CASUMKERNEL  = ../riscv64/zasum.c
+ZASUMKERNEL  = ../riscv64/zasum.c
+
+SSUMKERNEL  = ../arm/sum.c
+DSUMKERNEL  = ../arm/sum.c
+CSUMKERNEL  = ../arm/zsum.c
+ZSUMKERNEL  = ../arm/zsum.c
+
+SAXPYKERNEL  = ../riscv64/axpy.c
+DAXPYKERNEL  = ../riscv64/axpy.c
+CAXPYKERNEL  = ../riscv64/zaxpy.c
+ZAXPYKERNEL  = ../riscv64/zaxpy.c
+
+SCOPYKERNEL  = ../riscv64/copy.c
+DCOPYKERNEL  = ../riscv64/copy.c
+CCOPYKERNEL  = ../riscv64/zcopy.c
+ZCOPYKERNEL  = ../riscv64/zcopy.c
+
+SDOTKERNEL   = ../riscv64/dot.c
+DDOTKERNEL   = ../riscv64/dot.c
+CDOTKERNEL   = ../riscv64/zdot.c
+ZDOTKERNEL   = ../riscv64/zdot.c
+
+SNRM2KERNEL  = ../riscv64/nrm2.c
+DNRM2KERNEL  = ../riscv64/nrm2.c
+CNRM2KERNEL  = ../riscv64/znrm2.c
+ZNRM2KERNEL  = ../riscv64/znrm2.c
+
+SROTKERNEL   = ../riscv64/rot.c
+DROTKERNEL   = ../riscv64/rot.c
+CROTKERNEL   = ../riscv64/zrot.c
+ZROTKERNEL   = ../riscv64/zrot.c
+
+SSCALKERNEL  = ../riscv64/scal.c
+DSCALKERNEL  = ../riscv64/scal.c
+CSCALKERNEL  = ../riscv64/zscal.c
+ZSCALKERNEL  = ../riscv64/zscal.c
+
+SSWAPKERNEL  = ../riscv64/swap.c
+DSWAPKERNEL  = ../riscv64/swap.c
+CSWAPKERNEL  = ../riscv64/zswap.c
+ZSWAPKERNEL  = ../riscv64/zswap.c
+
+SGEMVNKERNEL = ../riscv64/gemv_n.c
+DGEMVNKERNEL = ../riscv64/gemv_n.c
+CGEMVNKERNEL = ../riscv64/zgemv_n.c
+ZGEMVNKERNEL = ../riscv64/zgemv_n.c
+
+SGEMVTKERNEL = ../riscv64/gemv_t.c
+DGEMVTKERNEL = ../riscv64/gemv_t.c
+CGEMVTKERNEL = ../riscv64/zgemv_t.c
+ZGEMVTKERNEL = ../riscv64/zgemv_t.c
+
+STRMMKERNEL	= ../generic/trmmkernel_2x2.c
+DTRMMKERNEL	= ../generic/trmmkernel_2x2.c
+CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+
+SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
+DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ =  cgemm_oncopy.o
+CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+
+ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ =  zgemm_oncopy.o
+ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+SSYMV_U_KERNEL =  ../generic/symv_k.c
+SSYMV_L_KERNEL =  ../generic/symv_k.c
+DSYMV_U_KERNEL =  ../generic/symv_k.c
+DSYMV_L_KERNEL =  ../generic/symv_k.c
+CSYMV_U_KERNEL =  ../generic/zsymv_k.c
+CSYMV_L_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
diff --git a/kernel/riscv64/amax.c b/kernel/riscv64/amax.c
new file mode 100644
index 000000000..792e68bd9
--- /dev/null
+++ b/kernel/riscv64/amax.c
@@ -0,0 +1,75 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+
+	if (n <= 0 || inc_x <= 0) return(maxf);
+
+	maxf=ABS(x[0]);
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( ABS(x[ix]) > maxf )
+		{
+			maxf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(maxf);
+}
+
+
diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
new file mode 100644
index 000000000..b6aec131e
--- /dev/null
+++ b/kernel/riscv64/amax_vector.c
@@ -0,0 +1,245 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+	if (n <= 0 || inc_x <= 0) return(maxf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_max;
+
+        MASK_T mask0, mask1;
+        FLOAT zero = 0.0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        v_max = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                j += gvl*2;
+                        }
+                        v0 = VFMVVF_FLOAT(0, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl);
+                        maxf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                        v1 = VFMVVF_FLOAT(0, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] > maxf)
+                                maxf = v0[0];
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        BLASLONG inc_xv = inc_x * gvl;
+                        v_max = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
+
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                j += gvl*2;
+                                ix += inc_xv*2;
+                        }
+                        v0 = VFMVVF_FLOAT(0, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl);
+                        maxf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                        v1 = VFMVVF_FLOAT(0, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] > maxf)
+                                maxf = v0[0];
+                        j += gvl;
+                }
+        }
+	return(maxf);
+}
+
+
diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c
new file mode 100644
index 000000000..78495a8e3
--- /dev/null
+++ b/kernel/riscv64/amin.c
@@ -0,0 +1,75 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf=0.0;
+
+	if (n <= 0 || inc_x <= 0) return(minf);
+
+	minf=ABS(x[0]);
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( ABS(x[ix]) < minf )
+		{
+			minf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(minf);
+}
+
+
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
new file mode 100644
index 000000000..53243ad56
--- /dev/null
+++ b/kernel/riscv64/amin_vector.c
@@ -0,0 +1,241 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	if (n <= 0 || inc_x <= 0) return(0.0);
+	FLOAT minf=FLT_MAX;
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_min;
+
+        MASK_T mask0, mask1;
+	FLOAT zero = 0.0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                j += gvl*2;
+                        }
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v_min, v1, gvl);
+                        minf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] < minf)
+                                minf = v0[0];
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        BLASLONG idx = 0, inc_xv = inc_x * gvl;
+                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
+
+                                v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
+                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+
+                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                j += gvl*2;
+                                idx += inc_xv*2;
+                        }
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v_min, v1, gvl);
+                        minf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(v0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] < minf)
+                                minf = v0[0];
+                        j += gvl;
+                }
+        }
+	return(minf);
+}
+
+
diff --git a/kernel/riscv64/asum.c b/kernel/riscv64/asum.c
new file mode 100644
index 000000000..b284ae3fc
--- /dev/null
+++ b/kernel/riscv64/asum.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	FLOAT sumf = 0.0;
+	if (n <= 0 || inc_x <= 0) return(sumf);
+
+	n *= inc_x;
+	while(i < n)
+	{
+		sumf += ABS(x[i]);
+		i += inc_x;
+	}
+	return(sumf);
+}
+
+
diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c
new file mode 100644
index 000000000..7ab7484e8
--- /dev/null
+++ b/kernel/riscv64/asum_vector.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFADDVV_FLOAT vfaddvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFADDVV_FLOAT vfaddvv_float64xm8
+#endif
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	FLOAT asumf=0.0;
+	if (n <= 0 || inc_x <= 0) return(asumf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_zero,v_sum;
+
+        MASK_T mask0, mask1;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                if(gvl <= n/2){
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                                v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                                v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl);
+                        asumf += v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                        v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl);
+                        asumf += v0[0];
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                if(gvl <= n/2){
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        BLASLONG inc_xv = inc_x * gvl;
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                                v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                                v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                                inc_xv += inc_xv * 2;
+                        }
+                        v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl);
+                        asumf += v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                        v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl);
+                        asumf += v0[0];
+                        j += gvl;
+                }
+        }
+	return(asumf);
+}
+
+
diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c
new file mode 100644
index 000000000..278747f75
--- /dev/null
+++ b/kernel/riscv64/axpby.c
@@ -0,0 +1,96 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix,iy;
+
+	if ( n < 0     )  return(0);
+
+	ix = 0;
+	iy = 0;
+
+	if ( beta == 0.0 )
+	{
+
+		if ( alpha == 0.0 )
+		{
+			while(i < n)
+			{
+				y[iy] = 0.0 ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+		else
+		{
+			while(i < n)
+			{
+				y[iy] = alpha * x[ix] ;
+				ix += inc_x ;
+				iy += inc_y ;
+				i++ ;
+			}
+
+
+		}
+
+	}
+	else
+	{
+
+		if ( alpha == 0.0 )
+		{
+			while(i < n)
+			{
+				y[iy] =  beta * y[iy] ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+		else
+		{
+			while(i < n)
+			{
+				y[iy] = alpha * x[ix] + beta * y[iy] ;
+				ix += inc_x ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
new file mode 100644
index 000000000..432708db7
--- /dev/null
+++ b/kernel/riscv64/axpby_vector.c
@@ -0,0 +1,378 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFMULVF_FLOAT vfmulvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFMULVF_FLOAT vfmulvf_float64xm4
+#endif
+
+int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
+{
+	if (n < 0)  return(0);
+
+	BLASLONG i=0, j=0;
+	unsigned int gvl = 0;
+	FLOAT_V_T vx0, vx1;
+        FLOAT_V_T vy0, vy1;
+
+	BLASLONG stride_x, stride_y, ix = 0, iy = 0;
+
+        if(beta == 0.0){
+                if(alpha == 0.0){//alpha == 0 && beta == 0
+                        if(inc_y == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                if(gvl <= n/2){
+                                        vy0 = VFMVVF_FLOAT(0.0, gvl);
+                                        for(i=0,j=0;i<n/(gvl*2);i++){
+                                                VSEV_FLOAT(&y[j], vy0, gvl);
+                                                VSEV_FLOAT(&y[j+gvl], vy0, gvl);
+                                                j += gvl * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vy0 = VFMVVF_FLOAT(0.0, gvl);
+                                        VSEV_FLOAT(&y[j], vy0, gvl);
+                                        j += gvl;
+                                }
+                        }else{
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_y = inc_y * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        vy0 = VFMVVF_FLOAT(0.0, gvl);
+                                        BLASLONG inc_yv = inc_y * gvl;
+                                        for(i=0,j=0;i<n/(gvl*2);i++){
+                                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy0, gvl);
+                                                j += gvl * 2;
+                                                iy += inc_yv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vy0 = VFMVVF_FLOAT(0.0, gvl);
+                                        VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+                                        j += gvl;
+                                }
+                        }
+
+                }else{//alpha != 0 && beta == 0, y = ax
+			if(inc_x == 1 && inc_y == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                if(gvl <= n/2){
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLEV_FLOAT(&x[j], gvl);
+                                                vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                VSEV_FLOAT(&y[j], vy0, gvl);
+
+                                                vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                                vy1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+                                                j += gvl * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLEV_FLOAT(&x[j], gvl);
+                                        vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        VSEV_FLOAT(&y[j], vy0, gvl);
+                                        j += gvl;
+                                }
+			}else if(inc_y == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_x = inc_x * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        BLASLONG inc_xv = inc_x * gvl;
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                                vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                VSEV_FLOAT(&y[j], vy0, gvl);
+
+                                                vx1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                                vy1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+                                                j += gvl * 2;
+                                                ix += inc_xv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                                        vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        VSEV_FLOAT(&y[j], vy0, gvl);
+                                        j += gvl;
+                                }
+                        }else if(inc_x == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_y = inc_y * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        BLASLONG inc_yv = inc_y * gvl;
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLEV_FLOAT(&x[j], gvl);
+                                                vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+
+                                                vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                                vy1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
+                                                j += gvl * 2;
+                                                iy += inc_yv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLEV_FLOAT(&x[j], gvl);
+                                        vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+                                        j += gvl;
+                                }
+                        }else{//inc_x !=1 && inc_y != 1
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_x = inc_x * sizeof(FLOAT);
+                                stride_y = inc_y * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        BLASLONG inc_xv = inc_x * gvl;
+                                        BLASLONG inc_yv = inc_y * gvl;
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                                vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+
+                                                vx1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                                vy1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
+                                                j += gvl * 2;
+                                                ix += inc_xv * 2;
+                                                iy += inc_yv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                                        vy0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+                                        j += gvl;
+                                }
+                        }
+		}
+        }else{//beta != 0
+		if(alpha == 0.0){//alpha == 0 && beta != 0; y = by
+			if(inc_y == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                if(gvl <= n/2){
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                                vy0 = VFMULVF_FLOAT(vy0, beta, gvl);
+                                                VSEV_FLOAT(&y[j], vy0, gvl);
+
+                                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                                vy1 = VFMULVF_FLOAT(vy1, beta, gvl);
+                                                VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+                                                j += gvl * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vy0 = VLEV_FLOAT(&y[j], gvl);
+                                        vy0 = VFMULVF_FLOAT(vy0, beta, gvl);
+                                        VSEV_FLOAT(&y[j], vy0, gvl);
+                                        j += gvl;
+                                }
+			}else{
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_y = inc_y * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        BLASLONG inc_yv = inc_y * gvl;
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                                vy0 = VFMULVF_FLOAT(vy0, beta, gvl);
+                                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+
+                                                vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
+                                                vy1 = VFMULVF_FLOAT(vy1, beta, gvl);
+                                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
+                                                j += gvl * 2;
+                                                iy += inc_yv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                                        vy0 = VFMULVF_FLOAT(vy0, beta, gvl);
+                                        VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+                                        j += gvl;
+                                }
+			}
+
+		}else{//alpha != 0 && beta != 0; y = ax + by
+			if(inc_x == 1 && inc_y == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                if(gvl <= n/2){
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLEV_FLOAT(&x[j], gvl);
+                                                vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                                vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                                VSEV_FLOAT(&y[j], vy0, gvl);
+
+                                                vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                                vx1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                                vy1 = VFMACCVF_FLOAT(vx1, beta, vy1,gvl);
+                                                VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+                                                j += gvl * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLEV_FLOAT(&x[j], gvl);
+                                        vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        vy0 = VLEV_FLOAT(&y[j], gvl);
+                                        vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                        VSEV_FLOAT(&y[j], vy0, gvl);
+                                        j += gvl;
+                                }
+			}else if(inc_y == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_x = inc_x * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        BLASLONG inc_xv = inc_x * gvl;
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                                vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                                vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                                VSEV_FLOAT(&y[j], vy0, gvl);
+
+                                                vx1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                                vx1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                                vy1 = VFMACCVF_FLOAT(vx1, beta, vy1, gvl);
+                                                VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+                                                j += gvl * 2;
+                                                ix += inc_xv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                                        vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        vy0 = VLEV_FLOAT(&y[j], gvl);
+                                        vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                        VSEV_FLOAT(&y[j], vy0, gvl);
+                                        j += gvl;
+                                }
+                        }else if(inc_x == 1){
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_y = inc_y * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        BLASLONG inc_yv = inc_y * gvl;
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLEV_FLOAT(&x[j], gvl);
+                                                vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                                vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+
+                                                vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                                vx1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
+                                                vy1 = VFMACCVF_FLOAT(vx1, beta, vy1, gvl);
+                                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
+                                                j += gvl * 2;
+                                                iy += inc_yv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLEV_FLOAT(&x[j], gvl);
+                                        vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                                        vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                        VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+                                        j += gvl;
+                                }
+                        }else{//inc_x != 1 && inc_y != 1
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                stride_x = inc_x * sizeof(FLOAT);
+                                stride_y = inc_y * sizeof(FLOAT);
+                                if(gvl <= n/2){
+                                        BLASLONG inc_xv = inc_x * gvl;
+                                        BLASLONG inc_yv = inc_y * gvl;
+                                        for(i=0,j=0;i<n/(2*gvl);i++){
+                                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                                vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                                vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+
+                                                vx1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                                vx1 = VFMULVF_FLOAT(vx1, alpha, gvl);
+                                                vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
+                                                vy1 = VFMACCVF_FLOAT(vx1, beta, vy1, gvl);
+                                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
+                                                j += gvl * 2;
+                                                ix += inc_xv * 2;
+                                                iy += inc_yv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                                        vx0 = VFMULVF_FLOAT(vx0, alpha, gvl);
+                                        vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                                        vy0 = VFMACCVF_FLOAT(vx0, beta, vy0, gvl);
+                                        VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+                                        j += gvl;
+                                }
+                        }
+                }
+        }
+	return(0);
+}
+
diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c
new file mode 100644
index 000000000..fb1094dd9
--- /dev/null
+++ b/kernel/riscv64/axpy.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix,iy;
+
+	if ( n < 0     )  return(0);
+	if ( da == 0.0 ) return(0);
+
+	ix = 0;
+	iy = 0;
+
+	while(i < n)
+	{
+
+		y[iy] += da * x[ix] ;
+		ix += inc_x ;
+		iy += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
new file mode 100644
index 000000000..5a7ba4b08
--- /dev/null
+++ b/kernel/riscv64/axpy_vector.c
@@ -0,0 +1,179 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0, j=0, jx=0, jy=0;
+	unsigned int gvl = 0;
+	FLOAT_V_T vx0, vx1;
+	FLOAT_V_T vy0, vy1;
+	BLASLONG stride_x, stride_y;
+
+	if (n < 0)  return(0);
+	if (da == 0.0) return(0);
+
+	if (inc_x == 1 && inc_y == 1) {
+
+		gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+
+		if (gvl <= n/2) {
+			for (i = 0, j=0; i < n/(2*gvl); i++, j+=2*gvl) {
+				vx0 = VLEV_FLOAT(&x[j], gvl);
+				vy0 = VLEV_FLOAT(&y[j], gvl);
+				vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+				VSEV_FLOAT(&y[j], vy0, gvl);
+
+				vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+				vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+				vy1 = VFMACCVF_FLOAT(vy1, da, vx1, gvl);
+				VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+			}
+		}
+		//tail
+		for (; j < n; ) {
+			gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M);
+			vx0 = VLEV_FLOAT(&x[j], gvl);
+			vy0 = VLEV_FLOAT(&y[j], gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+			VSEV_FLOAT(&y[j], vy0, gvl);
+
+			j += gvl;
+		}
+	}else if (inc_y == 1) {
+		stride_x = inc_x * sizeof(FLOAT);
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        BLASLONG inc_xv = inc_x * gvl;
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+			        vx0 = VLSEV_FLOAT(&x[jx], stride_x, gvl);
+                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+                                VSEV_FLOAT(&y[j], vy0, gvl);
+
+			        vx1 = VLSEV_FLOAT(&x[jx+inc_xv], stride_x, gvl);
+                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, da, vx1, gvl);
+                                VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+
+                                j += gvl * 2;
+                                jx += inc_xv * 2;
+                        }
+                }
+		for (; j<n; ) {
+			gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M);
+			vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+			vy0 = VLEV_FLOAT(&y[j], gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+			VSEV_FLOAT(&y[j], vy0, gvl);
+			j += gvl;
+		}
+        }else if(inc_x == 1){
+		stride_y = inc_y * sizeof(FLOAT);
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        BLASLONG inc_yv = inc_y * gvl;
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+			        vx0 = VLEV_FLOAT(&x[j], gvl);
+                                vy0 = VLSEV_FLOAT(&y[jy], stride_y, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+                                VSSEV_FLOAT(&y[jy], stride_y, vy0, gvl);
+
+			        vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                vy1 = VLSEV_FLOAT(&y[jy+inc_yv], stride_y, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, da, vx1, gvl);
+                                VSSEV_FLOAT(&y[jy+inc_yv], stride_y, vy1, gvl);
+
+                                j += gvl * 2;
+                                jy += inc_yv * 2;
+                        }
+                }
+		for (; j<n; ) {
+			gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M);
+			vx0 = VLEV_FLOAT(&x[j], gvl);
+			vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+			VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+			j += gvl;
+		}
+	}else{
+		stride_x = inc_x * sizeof(FLOAT);
+		stride_y = inc_y * sizeof(FLOAT);
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        BLASLONG inc_xv = inc_x * gvl;
+                        BLASLONG inc_yv = inc_y * gvl;
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+			        vx0 = VLSEV_FLOAT(&x[jx], stride_x, gvl);
+                                vy0 = VLSEV_FLOAT(&y[jy], stride_y, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+                                VSSEV_FLOAT(&y[jy], stride_y, vy0, gvl);
+
+			        vx1 = VLSEV_FLOAT(&x[jx+inc_xv], stride_x, gvl);
+                                vy1 = VLSEV_FLOAT(&y[jy+inc_yv], stride_y, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, da, vx1, gvl);
+                                VSSEV_FLOAT(&y[jy+inc_yv], stride_y, vy1, gvl);
+
+                                j += gvl * 2;
+                                jx += inc_xv * 2;
+                                jy += inc_yv * 2;
+                        }
+                }
+		for (; j<n; ) {
+			gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M);
+			vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+			vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl);
+			VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+			j += gvl;
+		}
+	}
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c
new file mode 100644
index 000000000..7b4f04f30
--- /dev/null
+++ b/kernel/riscv64/copy.c
@@ -0,0 +1,59 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n < 0     )  return(0);
+
+	while(i < n)
+	{
+
+		y[iy] = x[ix] ;
+		ix += inc_x ;
+		iy += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c
new file mode 100644
index 000000000..ce44a20f3
--- /dev/null
+++ b/kernel/riscv64/copy_vector.c
@@ -0,0 +1,148 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VSEV_FLOAT vsev_float32xm8
+#define VSSEV_FLOAT vssev_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VSEV_FLOAT vsev_float64xm8
+#define VSSEV_FLOAT vssev_float64xm8
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0,iy=0;
+	if(n < 0)  return(0);
+
+        BLASLONG stride_x, stride_y;
+        FLOAT_V_T v0, v1, v2, v3;
+        unsigned int gvl = 0;
+
+        if(inc_x == 1 && inc_y == 1){
+                memcpy(&y[0], &x[0], n*sizeof(FLOAT));
+        }else if (inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                stride_x = inc_x * sizeof(FLOAT);
+                if(gvl <= n/4){
+                        BLASLONG inc_xv = inc_x * gvl;
+                        BLASLONG gvl3 = gvl * 3;
+                        BLASLONG inc_xv3 = inc_xv * 3;
+                        for(i=0,j=0; i<n/(4*gvl); i++){
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                VSEV_FLOAT(&y[j], v0, gvl);
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                VSEV_FLOAT(&y[j+gvl], v1, gvl);
+
+                                v2 = VLSEV_FLOAT(&x[ix+inc_xv*2], stride_x, gvl);
+                                VSEV_FLOAT(&y[j+gvl*2], v2, gvl);
+                                v3 = VLSEV_FLOAT(&x[ix+inc_xv3], stride_x, gvl);
+                                VSEV_FLOAT(&y[j+gvl3], v3, gvl);
+                                j += gvl * 4;
+                                ix += inc_xv * 4;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        VSEV_FLOAT(&y[j], v0, gvl);
+                        j += gvl;
+                }
+        }else if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                stride_y = inc_y * sizeof(FLOAT);
+                if(gvl <= n/4){
+                        BLASLONG inc_yv = inc_y * gvl;
+                        BLASLONG inc_yv3 = inc_yv * 3;
+                        BLASLONG gvl3 = gvl * 3;
+                        for(i=0,j=0; i<n/(4*gvl); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, v1, gvl);
+
+                                v2 = VLEV_FLOAT(&x[j+gvl*2], gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv*2], stride_y, v2, gvl);
+                                v3 = VLEV_FLOAT(&x[j+gvl3], gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv3], stride_y, v3, gvl);
+                                j += gvl * 4;
+                                iy += inc_yv * 4;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl);
+                        j += gvl;
+                }
+
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                if(gvl <= n/4){
+                        BLASLONG inc_xv = inc_x * gvl;
+                        BLASLONG inc_yv = inc_y * gvl;
+                        BLASLONG inc_xv3 = inc_xv * 3;
+                        BLASLONG inc_yv3 = inc_yv * 3;
+                        for(i=0,j=0; i<n/(4*gvl); i++){
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, v1, gvl);
+
+                                v2 = VLSEV_FLOAT(&x[ix+inc_xv*2], stride_x, gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv*2], stride_y, v2, gvl);
+                                v3 = VLSEV_FLOAT(&x[ix+inc_xv3], stride_x, gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv3], stride_y, v3, gvl);
+
+                                j += gvl * 4;
+                                ix += inc_xv * 4;
+                                iy += inc_yv * 4;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl);
+                        j += gvl;
+                }
+        }
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/dgemm_kernel_8x4_c910v.c b/kernel/riscv64/dgemm_kernel_8x4_c910v.c
new file mode 100644
index 000000000..bcbf942c0
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_8x4_c910v.c
@@ -0,0 +1,977 @@
+#include "common.h"
+#include <riscv-vector.h>
+
+#define KERNEL8x4_I \
+	"addi       t1,    %[PB], 1*8  \n\t"\
+	"addi       t2,    %[PB], 2*8  \n\t"\
+	"addi       t3,    %[PB], 3*8  \n\t"\
+	"fld        ft0,  (%[PB])      \n\t"\
+	"fld        ft1,  (t1)         \n\t"\
+	"fld        ft2,  (t2)         \n\t"\
+	"fld        ft3,  (t3)         \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       t4,    %[PA], 2*8  \n\t"\
+	"addi       t5,    %[PA], 4*8  \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"addi       t6,    %[PA], 6*8  \n\t"\
+	"addi       %[PA], %[PA], 8*8  \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*8  \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"\
+	"vle.v      v2,   (t5)         \n\t"\
+	"addi       t5,    t5,    8*8  \n\t"\
+	"vle.v      v3,   (t6)         \n\t"\
+	"addi       t6,    t6,    8*8  \n\t"\
+	"vfmv.v.f   v10,  ft2          \n\t"\
+	"addi       %[PB], %[PB], 4*8  \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 8*8  \n\t"\
+	"vfmv.v.f   v11,  ft3          \n\t"\
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"addi       t1,   t1,     4*8  \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*8  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"addi       t2,   t2,     4*8  \n\t"\
+	"vle.v      v6,   (t5)         \n\t"\
+	"addi       t5,    t5,    8*8  \n\t"\
+	"vfmacc.vv  v18,  v8,    v2   \n\t"\
+	"addi       t3,   t3,     4*8  \n\t"\
+	"vle.v      v7,   (t6)         \n\t"\
+	"addi       t6,    t6,    8*8  \n\t"\
+	"vfmacc.vv  v19,  v8,    v3   \n\t"\
+	"fld        ft4,  (%[PB])   \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"fld        ft5,  (t1)        \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"fld        ft6,  (t2)        \n\t"\
+	"vfmacc.vv  v22,  v9,    v2   \n\t"\
+	"fld        ft7,  (t3)        \n\t"\
+	"vfmacc.vv  v23,  v9,    v3   \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"vfmacc.vv  v24,  v10,    v0    \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"\
+	"vfmacc.vv  v25,  v10,    v1    \n\t"\
+	"vfmv.v.f   v14,  ft6          \n\t"\
+	"vfmacc.vv  v26,  v10,    v2    \n\t"\
+	"vfmv.v.f   v15,  ft7          \n\t"\
+	"vfmacc.vv  v27,  v10,    v3    \n\t"\
+        "addi       %[PB], %[PB], 4*8  \n\t"\
+	"vfmacc.vv  v28,  v11,    v0    \n\t"\
+	"addi       t1,   t1,     4*8  \n\t"\
+	"vfmacc.vv  v29,  v11,    v1    \n\t"\
+	"addi       t2,   t2,     4*8  \n\t"\
+	"vfmacc.vv  v30,  v11,    v2    \n\t"\
+	"addi       t3,   t3,     4*8  \n\t"\
+	"vfmacc.vv  v31,  v11,    v3    \n\t"
+
+#define KERNEL8x4_M1 \
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 8*8  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*8  \n\t"\
+	"vfmacc.vv  v18,  v8,    v2   \n\t"\
+	"vle.v      v6,   (t5)         \n\t"\
+	"addi       t5,    t5,    8*8  \n\t"\
+	"vfmacc.vv  v19,  v8,    v3   \n\t"\
+	"vle.v      v7,   (t6)         \n\t"\
+	"addi       t6,    t6,    8*8  \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"fld        ft4,  (%[PB])      \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"fld        ft5,  (t1)        \n\t"\
+	"vfmacc.vv  v22,  v9,    v2   \n\t"\
+	"fld        ft6,  (t2)        \n\t"\
+	"vfmacc.vv  v23,  v9,    v3   \n\t"\
+	"fld        ft7,  (t3)        \n\t"\
+        "addi       %[PB], %[PB], 4*8  \n\t"\
+	"vfmacc.vv  v24,  v10,    v0   \n\t"\
+	"addi       t1,   t1,     4*8  \n\t"\
+	"vfmacc.vv  v25,  v10,    v1   \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"vfmacc.vv  v26,  v10,    v2   \n\t"\
+	"addi       t2,   t2,     4*8  \n\t"\
+	"vfmacc.vv  v27,  v10,    v3   \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"\
+	"vfmacc.vv  v28,  v11,    v0   \n\t"\
+	"addi       t3,   t3,     4*8  \n\t"\
+	"vfmacc.vv  v29,  v11,    v1   \n\t"\
+	"vfmv.v.f   v14,  ft6          \n\t"\
+	"vfmacc.vv  v30,  v11,    v2   \n\t"\
+	"vfmacc.vv  v31,  v11,    v3   \n\t"\
+	"vfmv.v.f   v15,  ft7          \n\t"
+
+#define KERNEL8x4_M2 \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 8*8  \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*8  \n\t"\
+	"vfmacc.vv  v18,  v12,    v6   \n\t"\
+	"vle.v      v2,   (t5)         \n\t"\
+	"addi       t5,    t5,    8*8  \n\t"\
+	"vfmacc.vv  v19,  v12,    v7   \n\t"\
+	"vle.v      v3,   (t6)         \n\t"\
+	"addi       t6,    t6,    8*8  \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"fld        ft0,  (%[PB])      \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"fld        ft1,  (t1)         \n\t"\
+	"vfmacc.vv  v22,  v13,    v6   \n\t"\
+	"fld        ft2,  (t2)         \n\t"\
+	"vfmacc.vv  v23,  v13,    v7   \n\t"\
+	"fld        ft3,  (t3)         \n\t"\
+        "addi       %[PB], %[PB], 4*8  \n\t"\
+	"vfmacc.vv  v24,  v14,    v4   \n\t"\
+	"addi       t1,   t1,     4*8  \n\t"\
+	"vfmacc.vv  v25,  v14,    v5   \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"vfmacc.vv  v26,  v14,    v6   \n\t"\
+	"addi       t2,   t2,     4*8  \n\t"\
+	"vfmacc.vv  v27,  v14,    v7   \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"\
+	"vfmacc.vv  v28,  v15,    v4   \n\t"\
+	"addi       t3,   t3,     4*8  \n\t"\
+	"vfmacc.vv  v29,  v15,    v5   \n\t"\
+	"vfmv.v.f   v10,  ft2          \n\t"\
+	"vfmacc.vv  v30,  v15,    v6   \n\t"\
+	"vfmacc.vv  v31,  v15,    v7   \n\t"\
+	"vfmv.v.f   v11,  ft3          \n\t"
+
+#define KERNEL8x4_E \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vfmacc.vv  v18,  v12,    v6   \n\t"\
+	"vfmacc.vv  v19,  v12,    v7   \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"vfmacc.vv  v22,  v13,    v6   \n\t"\
+	"vfmacc.vv  v23,  v13,    v7   \n\t"\
+	"vfmacc.vv  v24,  v14,    v4   \n\t"\
+	"vfmacc.vv  v25,  v14,    v5   \n\t"\
+	"vfmacc.vv  v26,  v14,    v6   \n\t"\
+	"vfmacc.vv  v27,  v14,    v7   \n\t"\
+	"vfmacc.vv  v28,  v15,    v4   \n\t"\
+	"vfmacc.vv  v29,  v15,    v5   \n\t"\
+	"vfmacc.vv  v30,  v15,    v6   \n\t"\
+	"vfmacc.vv  v31,  v15,    v7   \n\t"
+
+
+
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+		,BLASLONG offset
+#endif
+		)
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3;
+   FLOAT *ptrba,*ptrbb;
+   
+   FLOAT loadb0,loadb1,loadb2,loadb3;
+   FLOAT load0,load1,load2,load3,load4,load5,load6,load7;
+
+   FLOAT res0,res1,res2,res3;
+   FLOAT res4,res5,res6,res7;
+   FLOAT res8,res9,res10,res11;
+   FLOAT res12,res13,res14,res15;
+
+   for (j=0; j<bn/4; j+=1){
+	   C0 = C;
+	   C1 = C0+ldc;
+	   C2 = C1+ldc;
+	   C3 = C2+ldc;
+
+	   ptrba = ba;
+	   for(i=0; i<bm/8; i+=1){
+		   ptrbb = bb;
+		   //t0 for k
+		   //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
+		   //v0-v3,v4-v7 for A, t4-t6 for PA1-3
+		   //v16-v31 for temp C
+		   
+		   asm volatile(
+				"vsetvli    zero, zero, e64,m1 \n\t"
+				"fmv.w.x    ft11, zero         \n\t"
+				"mv         t0,   %[BK]        \n\t"
+				
+				"vfmv.v.f   v16,  ft11         \n\t"
+				"vfmv.v.f   v17,  ft11         \n\t"
+				"vfmv.v.f   v18,  ft11         \n\t"
+				"vfmv.v.f   v19,  ft11         \n\t"
+
+				"vfmv.v.f   v20,  ft11         \n\t"
+				"vfmv.v.f   v21,  ft11         \n\t"
+				"vfmv.v.f   v22,  ft11         \n\t"
+				"vfmv.v.f   v23,  ft11         \n\t"
+
+				"vfmv.v.f   v24,  ft11         \n\t"
+				"vfmv.v.f   v25,  ft11         \n\t"
+				"vfmv.v.f   v26,  ft11         \n\t"
+				"vfmv.v.f   v27,  ft11         \n\t"
+				
+				"vfmv.v.f   v28,  ft11         \n\t"
+				"vfmv.v.f   v29,  ft11         \n\t"
+				"vfmv.v.f   v30,  ft11         \n\t"
+				"vfmv.v.f   v31,  ft11         \n\t"
+				//unloop 8
+				"srli       t0,   %[BK], 3     \n\t"
+				"blez       t0,   M8x4_TAIL    \n\t"
+				
+				//preloop
+				KERNEL8x4_I
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				"addi       t0,   t0, -1       \n\t"
+				"blez       t0,   M8x4_MAINLOOP_TAIL    \n\t"
+				".align 4                      \n\t"
+				"M8x4_MAINLOOP:                \n\t"
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M8x4_MAINLOOP \n\t"
+				
+				"M8x4_MAINLOOP_TAIL:           \n\t"
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_E
+				
+				//tail
+				"M8x4_TAIL:                    \n\t"
+				"andi       t0,   %[BK], 7     \n\t"
+				"blez       t0,   M8x4_SAVERESULT   \n\t"
+
+				"addi       t4,    %[PA], 2*8  \n\t"
+				"addi       t5,    %[PA], 4*8  \n\t"
+				"addi       t6,    %[PA], 6*8  \n\t"
+				"addi       t1,    %[PB], 1*8  \n\t"
+				"addi       t2,    %[PB], 2*8  \n\t"
+				"addi       t3,    %[PB], 3*8  \n\t"
+
+				".align 4                      \n\t"
+				"M8x4_TAILLOOP:                \n\t"
+				"fld        ft0,  (%[PB])      \n\t"
+				"addi       %[PB], %[PB], 4*8  \n\t"
+				"vle.v      v0,   (%[PA])      \n\t"
+				"add        %[PA], %[PA], 8*8  \n\t"
+				"vle.v      v1,   (t4)         \n\t"
+				"addi       t4,    t4,    8*8  \n\t"
+
+				"vfmv.v.f   v8,   ft0          \n\t"
+				"fld        ft1,  (t1)         \n\t"
+				"addi       t1,   t1,     4*8  \n\t"
+				"vle.v      v2,   (t5)         \n\t"
+				"addi       t5,    t5,    8*8  \n\t"
+				"vle.v      v3,   (t6)         \n\t"
+				"addi       t6,    t6,    8*8  \n\t"
+
+				"vfmacc.vv  v16,  v8,    v0    \n\t"
+				"fld        ft2,  (t2)         \n\t"
+				"addi       t2,   t2,     4*8  \n\t"
+				"vfmacc.vv  v17,  v8,    v1    \n\t"
+				"vfmacc.vv  v18,  v8,    v2    \n\t"
+				"vfmv.v.f   v9,   ft1          \n\t"
+				"vfmacc.vv  v19,  v8,    v3    \n\t"
+								
+
+				"vfmacc.vv  v20,  v9,    v0    \n\t"
+				"fld        ft3,  (t3)         \n\t"
+				"addi       t3,   t3,     4*8  \n\t"
+				"vfmacc.vv  v21,  v9,    v1    \n\t"
+				"vfmacc.vv  v22,  v9,    v2    \n\t"
+				"vfmv.v.f   v10,  ft2          \n\t"
+				"vfmacc.vv  v23,  v9,    v3    \n\t"
+
+				"vfmv.v.f   v11,  ft3          \n\t"
+				"vfmacc.vv  v24,  v10,    v0    \n\t"
+				"vfmacc.vv  v25,  v10,    v1    \n\t"
+				"vfmacc.vv  v26,  v10,    v2    \n\t"
+				"vfmacc.vv  v27,  v10,    v3    \n\t"
+
+				"vfmacc.vv  v28,  v11,    v0    \n\t"
+				"vfmacc.vv  v29,  v11,    v1    \n\t"
+				"vfmacc.vv  v30,  v11,    v2    \n\t"
+				"vfmacc.vv  v31,  v11,    v3    \n\t"
+
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M8x4_TAILLOOP \n\t"
+				
+				//Save result
+				//load C
+				"M8x4_SAVERESULT:              \n\t"
+				//use v8 to store alpha
+				"vfmv.v.f   v8,   %[ALPHA]     \n\t"
+				"vle.v      v0,   (%[C0])      \n\t"
+				"addi       t4,   %[C0], 2*8   \n\t"
+				"vle.v      v1,   (%[C1])      \n\t"
+				"addi       t5,   %[C1], 2*8   \n\t"
+				"vle.v      v2,   (%[C2])      \n\t"
+				"addi       t6,   %[C2], 2*8   \n\t"
+				"vle.v      v3,   (%[C3])      \n\t"
+				"addi       t3,   %[C3], 2*8   \n\t"
+				
+				//Multiply Alpha
+				"vfmacc.vv  v0,   v8, v16 \n\t"
+				"vle.v      v4,   (t4)          \n\t"
+				"vfmacc.vv  v1,   v8, v20 \n\t"
+				"vle.v      v5,   (t5)          \n\t"
+				"vfmacc.vv  v2,   v8, v24 \n\t"
+				"vle.v      v6,   (t6)          \n\t"
+				"vfmacc.vv  v3,   v8, v28 \n\t"
+				"vle.v      v7,   (t3)          \n\t"
+
+				"vfmacc.vv  v4,   v8, v17 \n\t"
+				"vse.v      v0,   (%[C0])      \n\t"
+				"add        %[C0], %[C0], 4*8  \n\t"
+				"vfmacc.vv  v5,   v8, v21 \n\t"
+				"vse.v      v1,   (%[C1])      \n\t"
+				"add        %[C1], %[C1], 4*8  \n\t"
+				
+				"vfmacc.vv  v6,   v8, v25 \n\t"
+				"vse.v      v2,   (%[C2])      \n\t"
+				"add        %[C2], %[C2], 4*8  \n\t"
+
+				"vfmacc.vv  v7,   v8, v29 \n\t"
+				"vse.v      v3,   (%[C3])      \n\t"
+				"add        %[C3], %[C3], 4*8  \n\t"
+
+				"vle.v      v0,   (%[C0])      \n\t"
+				"vse.v      v4,   (t4)         \n\t"
+				"add        t4,   t4,     4*8  \n\t"
+				
+				"vle.v      v1,   (%[C1])      \n\t"
+				"vse.v      v5,   (t5)         \n\t"
+				"add        t5,   t5,     4*8  \n\t"
+
+				"vle.v      v2,   (%[C2])      \n\t"
+				"vse.v      v6,   (t6)         \n\t"
+				"add        t6,   t6,     4*8  \n\t"
+
+				"vle.v      v3,   (%[C3])      \n\t"
+				"vse.v      v7,   (t3)         \n\t"
+				"add        t3,   t3,     4*8  \n\t"
+
+
+				"vfmacc.vv  v0,   v8, v18 \n\t"
+				"vle.v      v4,   (t4)          \n\t"
+				"vfmacc.vv  v1,   v8, v22 \n\t"
+				"vle.v      v5,   (t5)          \n\t"
+				"vfmacc.vv  v2,   v8, v26 \n\t"
+				"vle.v      v6,   (t6)          \n\t"
+				"vfmacc.vv  v3,   v8, v30 \n\t"
+				"vle.v      v7,   (t3)          \n\t"
+
+				"vfmacc.vv  v4,   v8, v19 \n\t"
+				"vse.v      v0,   (%[C0])      \n\t"
+				"add        %[C0], %[C0], 4*8  \n\t"
+
+				"vfmacc.vv  v5,   v8, v23 \n\t"
+				"vse.v      v1,   (%[C1])      \n\t"
+				"add        %[C1], %[C1], 4*8  \n\t"
+
+				"vfmacc.vv  v6,   v8, v27 \n\t"
+				"vse.v      v2,   (%[C2])      \n\t"
+				"add        %[C2], %[C2], 4*8  \n\t"
+
+				"vfmacc.vv  v7,   v8, v31 \n\t"
+				"vse.v      v3,   (%[C3])      \n\t"
+				"add        %[C3], %[C3], 4*8  \n\t"
+
+				"vse.v      v4,   (t4)         \n\t"
+				"vse.v      v5,   (t5)         \n\t"
+				"vse.v      v6,   (t6)         \n\t"
+				"vse.v      v7,   (t3)         \n\t"
+				"M8x4_END:                     \n\t"
+				
+				:[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
+				 [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
+				:[ALPHA]"f"(alpha), [BK]"r"(bk)
+				:"cc", "t0", "t4","t5","t6","t3","t1","t2",
+				 "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
+				 "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
+				 "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
+				 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+				 "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+	   }
+	   if(bm&4){
+		   ptrbb = bb;
+      		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   res4 = 0;
+		   res5 = 0;
+		   res6 = 0;
+		   res7 = 0;
+		   res8 = 0;
+		   res9 = 0;
+		   res10 = 0;
+		   res11 = 0;
+		   res12 = 0;
+		   res13 = 0;
+		   res14 = 0;
+		   res15 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+			   load2 = ptrba[2];
+			   load3 = ptrba[3];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+			   res2 = res2 + load2 * loadb0;
+			   res3 = res3 + load3 * loadb0;
+
+			   res4 = res4 + load0 * loadb1;
+			   res5 = res5 + load1 * loadb1;
+			   res6 = res6 + load2 * loadb1;
+			   res7 = res7 + load3 * loadb1;
+
+			   loadb2 = ptrbb[2];
+			   loadb3 = ptrbb[3];
+			   
+   			   res8 = res8 + load0 * loadb2;
+			   res9 = res9 + load1 * loadb2;
+			   res10 = res10 + load2 * loadb2;
+			   res11 = res11 + load3 * loadb2;
+
+			   res12 = res12 + load0 * loadb3;
+			   res13 = res13 + load1 * loadb3;
+			   res14 = res14 + load2 * loadb3;
+			   res15 = res15 + load3 * loadb3;
+
+			   ptrba += 4;
+			   ptrbb += 4;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+		   res6 = res6 * alpha;
+		   res7 = res7 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+		   res10 = res10 * alpha;
+		   res11 = res11 * alpha;
+		   res12 = res12 * alpha;
+		   res13 = res13 * alpha;
+		   res14 = res14 * alpha;
+		   res15 = res15 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   
+		   C1[0] += res4;
+		   C1[1] += res5;
+		   C1[2] += res6;
+		   C1[3] += res7;
+
+   		   C2[0] += res8;
+		   C2[1] += res9;
+		   C2[2] += res10;
+		   C2[3] += res11;
+		   
+		   C3[0] += res12;
+		   C3[1] += res13;
+		   C3[2] += res14;
+		   C3[3] += res15;
+
+		   C0 += 4;
+		   C1 += 4;
+		   C2 += 4;
+		   C3 += 4;
+	   }
+   	   if(bm&2){
+		   ptrbb = bb;
+		   
+       		   res0 = 0;
+		   res1 = 0;
+		   
+		   res4 = 0;
+		   res5 = 0;
+		   
+		   res8 = 0;
+		   res9 = 0;
+		   
+		   res12 = 0;
+		   res13 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+
+			   res4 = res4 + load0 * loadb1;
+			   res5 = res5 + load1 * loadb1;
+
+			   loadb2 = ptrbb[2];
+			   loadb3 = ptrbb[3];
+			   
+   			   res8 = res8 + load0 * loadb2;
+			   res9 = res9 + load1 * loadb2;
+
+			   res12 = res12 + load0 * loadb3;
+			   res13 = res13 + load1 * loadb3;
+
+			   ptrba += 2;
+			   ptrbb += 4;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+
+		   res12 = res12 * alpha;
+		   res13 = res13 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+
+		   C1[0] += res4;
+		   C1[1] += res5;
+
+   		   C2[0] += res8;
+		   C2[1] += res9;
+		   
+		   C3[0] += res12;
+		   C3[1] += res13;
+
+		   C0 += 2;
+		   C1 += 2;
+		   C2 += 2;
+		   C3 += 2;
+	   }
+	   if(bm&1){
+		   ptrbb = bb;
+		   		   
+       		   res0 = 0;
+		   
+		   res4 = 0;
+		   
+		   res8 = 0;
+		   
+		   res12 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+				   
+			   res0 = res0 + load0 * loadb0;
+
+			   res4 = res4 + load0 * loadb1;
+
+			   loadb2 = ptrbb[2];
+			   loadb3 = ptrbb[3];
+			   
+   			   res8 = res8 + load0 * loadb2;
+
+			   res12 = res12 + load0 * loadb3;
+
+			   ptrba += 1;
+			   ptrbb += 4;
+		   }
+		   
+      		   res0 = res0 * alpha;
+
+		   res4 = res4 * alpha;
+
+       		   res8 = res8 * alpha;
+
+		   res12 = res12 * alpha;
+
+		   C0[0] += res0;
+		   C1[0] += res4;
+   		   C2[0] += res8;
+		   C3[0] += res12;
+
+		   C0 += 1;
+		   C1 += 1;
+		   C2 += 1;
+		   C3 += 1;
+	   }
+	   
+	   k = bk<<2;
+	   bb = bb+k;
+	   i = ldc<<2;
+	   C = C+i;
+   }
+   
+   if(bn&2){
+	   C0 = C;
+	   C1 = C0+ldc;
+
+	   ptrba = ba;
+	   for(i=0; i<bm/8; i+=1){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   res4 = 0;
+		   res5 = 0;
+		   res6 = 0;
+		   res7 = 0;
+		   res8 = 0;
+		   res9 = 0;
+		   res10 = 0;
+		   res11 = 0;
+		   res12 = 0;
+		   res13 = 0;
+		   res14 = 0;
+		   res15 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+			   load2 = ptrba[2];
+			   load3 = ptrba[3];
+			   load4 = ptrba[4];
+			   load5 = ptrba[5];
+			   load6 = ptrba[6];
+			   load7 = ptrba[7];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+			   res2 = res2 + load2 * loadb0;
+			   res3 = res3 + load3 * loadb0;
+
+			   res4 = res4 + load4 * loadb0;
+			   res5 = res5 + load5 * loadb0;
+			   res6 = res6 + load6 * loadb0;
+			   res7 = res7 + load7 * loadb0;
+
+   			   res8 = res8 + load0 * loadb1;
+			   res9 = res9 + load1 * loadb1;
+			   res10 = res10 + load2 * loadb1;
+			   res11 = res11 + load3 * loadb1;
+
+			   res12 = res12 + load4 * loadb1;
+			   res13 = res13 + load5 * loadb1;
+			   res14 = res14 + load6 * loadb1;
+			   res15 = res15 + load7 * loadb1;
+
+			   ptrba += 8;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+		   res6 = res6 * alpha;
+		   res7 = res7 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+		   res10 = res10 * alpha;
+		   res11 = res11 * alpha;
+		   res12 = res12 * alpha;
+		   res13 = res13 * alpha;
+		   res14 = res14 * alpha;
+		   res15 = res15 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   C0[4] += res4;
+		   C0[5] += res5;
+		   C0[6] += res6;
+		   C0[7] += res7;
+
+   		   C1[0] += res8;
+		   C1[1] += res9;
+		   C1[2] += res10;
+		   C1[3] += res11;
+		   C1[4] += res12;
+		   C1[5] += res13;
+		   C1[6] += res14;
+		   C1[7] += res15;
+
+		   C0 += 8;
+		   C1 += 8;
+	   }
+	   if(bm&4){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   
+		   res8 = 0;
+		   res9 = 0;
+		   res10 = 0;
+		   res11 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+			   load2 = ptrba[2];
+			   load3 = ptrba[3];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+			   res2 = res2 + load2 * loadb0;
+			   res3 = res3 + load3 * loadb0;
+
+   			   res8 = res8 + load0 * loadb1;
+			   res9 = res9 + load1 * loadb1;
+			   res10 = res10 + load2 * loadb1;
+			   res11 = res11 + load3 * loadb1;
+
+			   ptrba += 4;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+		   res10 = res10 * alpha;
+		   res11 = res11 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+
+   		   C1[0] += res8;
+		   C1[1] += res9;
+		   C1[2] += res10;
+		   C1[3] += res11;
+
+		   C0 += 4;
+		   C1 += 4;
+	   }
+   	   if(bm&2){
+		   ptrbb = bb;
+      		   res0 = 0;
+		   res1 = 0;
+		   
+		   res8 = 0;
+		   res9 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+
+   			   res8 = res8 + load0 * loadb1;
+			   res9 = res9 + load1 * loadb1;
+
+			   ptrba += 2;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+
+   		   C1[0] += res8;
+		   C1[1] += res9;
+		   
+		   C0 += 2;
+		   C1 += 2;
+	   }
+	   if(bm&1){
+		   ptrbb = bb;
+       		   res0 = 0;
+		   res8 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+			   load0 = ptrba[0];
+				   
+			   res0 = res0 + load0 * loadb0;
+   			   res8 = res8 + load0 * loadb1;
+			   ptrba += 1;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+       		   res8 = res8 * alpha;
+
+		   C0[0] += res0;
+   		   C1[0] += res8;
+		   
+		   C0 += 1;
+		   C1 += 1;
+	   }
+	   k = bk<<1;
+	   bb = bb+k;
+	   i = ldc<<1;
+	   C = C+i;
+   }
+
+   if (bn&1){
+	   C0 = C;
+	   ptrba = ba;
+	   for(i=0; i<bm/8; i+=1){
+		   ptrbb = bb;
+		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   res4 = 0;
+		   res5 = 0;
+		   res6 = 0;
+		   res7 = 0;
+
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   res1 = res1 + ptrba[1] * loadb0;
+			   res2 = res2 + ptrba[2] * loadb0;
+			   res3 = res3 + ptrba[3] * loadb0;
+
+			   res4 = res4 + ptrba[4] * loadb0;
+			   res5 = res5 + ptrba[5] * loadb0;
+			   res6 = res6 + ptrba[6] * loadb0;
+			   res7 = res7 + ptrba[7] * loadb0;
+			   
+			   ptrba += 8;
+			   ptrbb += 1;
+		   }
+   		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+		   res6 = res6 * alpha;
+		   res7 = res7 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   C0[4] += res4;
+		   C0[5] += res5;
+		   C0[6] += res6;
+		   C0[7] += res7;
+		   
+		   C0 += 8;
+	   }
+	   if(bm&4){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   res1 = res1 + ptrba[1] * loadb0;
+			   res2 = res2 + ptrba[2] * loadb0;
+			   res3 = res3 + ptrba[3] * loadb0;
+
+			   ptrba += 4;
+			   ptrbb += 1;
+		   }
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   
+		   C0 += 4;
+	   }
+   	   if(bm&2){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   res1 = res1 + ptrba[1] * loadb0;
+
+			   ptrba += 2;
+			   ptrbb += 1;
+		   }
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   
+		   C0 += 2;
+	   }
+	   if(bm&1){
+   		   ptrbb = bb;
+   		   res0 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   ptrba += 1;
+			   ptrbb += 1;
+		   }
+      		   res0 = res0 * alpha;
+		   C0[0] += res0;
+		   C0 += 1;
+	   }
+
+	   k = bk;
+	   bb = bb+k;
+	   C = C+ldc;
+   }
+   return 0;
+}
diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c
new file mode 100644
index 000000000..46a84ad18
--- /dev/null
+++ b/kernel/riscv64/dot.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	double dot = 0.0 ;
+
+	if ( n < 0 )  return(dot);
+
+	while(i < n)
+	{
+
+		dot += y[iy] * x[ix] ;
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	return(dot);
+
+}
+
+
diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c
new file mode 100644
index 000000000..8b1ae278c
--- /dev/null
+++ b/kernel/riscv64/dot_vector.c
@@ -0,0 +1,172 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFDOTVV_FLOAT vfdotvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFDOTVV_FLOAT vfdotvv_float64xm4
+#endif
+
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+	BLASLONG i=0, j=0;
+	double dot = 0.0 ;
+
+	if ( n < 0 )  return(dot);
+
+        FLOAT_V_T vr, vx, vy;
+        unsigned int gvl = 0;
+        if(inc_x == 1 && inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+                        vr = VFMACCVV_FLOAT(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        vx = VFMVVF_FLOAT(0, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vx, gvl);
+                        dot += vx[0];
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+                        FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
+                        //vr = VFDOTVV_FLOAT(vx, vy, gvl);
+                        vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vz, gvl);
+                        dot += vx[0];
+                }
+        }else if(inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+                        vr = VFMACCVV_FLOAT(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        vx = VFMVVF_FLOAT(0, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vx, gvl);
+                        dot += vx[0];
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+                        FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
+                        //vr = VFDOTVV_FLOAT(vx, vy, gvl);
+                        vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vz, gvl);
+                        dot += vx[0];
+                }
+        }else if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                unsigned int stride_y = inc_y * sizeof(FLOAT);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                        vr = VFMACCVV_FLOAT(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        vx = VFMVVF_FLOAT(0, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vx, gvl);
+                        dot += vx[0];
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                        FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
+                        //vr = VFDOTVV_FLOAT(vx, vy, gvl);
+                        vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vz, gvl);
+                        dot += vx[0];
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                unsigned int stride_y = inc_y * sizeof(FLOAT);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                        vr = VFMACCVV_FLOAT(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        vx = VFMVVF_FLOAT(0, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vx, gvl);
+                        dot += vx[0];
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                        FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
+                        //vr = VFDOTVV_FLOAT(vx, vy, gvl);
+                        vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
+                        vx = VFREDSUM_FLOAT(vr, vz, gvl);
+                        dot += vx[0];
+                }
+        }
+	return(dot);
+}
+
+
diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c
new file mode 100644
index 000000000..ef61b245b
--- /dev/null
+++ b/kernel/riscv64/gemv_n.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+ * * 2013/09/14 Saar
+ * *	 BLASTEST float		: OK
+ * * 	 BLASTEST double	: OK
+ * 	 CTEST			: OK
+ * 	 TEST			: OK
+ * *
+ * **************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT temp;
+
+	ix = 0;
+	a_ptr = a;
+
+	for (j=0; j<n; j++)
+	{
+		temp = alpha * x[ix];
+		iy = 0;
+		for (i=0; i<m; i++)
+		{
+			y[iy] += temp * a_ptr[i];
+			iy += inc_y;
+		}
+		a_ptr += lda;
+		ix    += inc_x;
+	}
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c
new file mode 100644
index 000000000..bd4d23eae
--- /dev/null
+++ b/kernel/riscv64/gemv_n_vector.c
@@ -0,0 +1,146 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i = 0, j = 0, k = 0;
+        BLASLONG ix = 0, iy = 0;
+
+	if(n < 0)  return(0);
+        FLOAT *a_ptr = a;
+        FLOAT temp = 0.0;
+        FLOAT_V_T va0, va1, vy0, vy1;
+        unsigned int gvl = 0;
+        if(inc_y == 1){
+                gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+                if(gvl <= m/2){
+                        for(k=0,j=0; k<m/(2*gvl); k++){
+                                a_ptr = a;
+                                ix = 0;
+                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                for(i = 0; i < n; i++){
+                                        temp = alpha * x[ix];
+                                        va0 = VLEV_FLOAT(&a_ptr[j], gvl);
+                                        vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);
+
+                                        va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl);
+                                        vy1 = VFMACCVF_FLOAT(vy1, temp, va1, gvl);
+                                        a_ptr += lda;
+                                        ix += inc_x;
+                                }
+                                VSEV_FLOAT(&y[j], vy0, gvl);
+                                VSEV_FLOAT(&y[j+gvl], vy1, gvl);
+                                j += gvl * 2;
+                        }
+                }
+                //tail
+                for(;j < m;){
+                        gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M);
+                        a_ptr = a;
+                        ix = 0;
+                        vy0 = VLEV_FLOAT(&y[j], gvl);
+                        for(i = 0; i < n; i++){
+                                temp = alpha * x[ix];
+                                va0 = VLEV_FLOAT(&a_ptr[j], gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);
+
+                                a_ptr += lda;
+                                ix += inc_x;
+                        }
+                        VSEV_FLOAT(&y[j], vy0, gvl);
+                        j += gvl;
+                }
+        }else{
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+                if(gvl <= m/2){
+                        BLASLONG inc_yv = inc_y * gvl;
+                        for(k=0,j=0; k<m/(2*gvl); k++){
+                                a_ptr = a;
+                                ix = 0;
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
+                                for(i = 0; i < n; i++){
+                                        temp = alpha * x[ix];
+                                        va0 = VLEV_FLOAT(&a_ptr[j], gvl);
+                                        vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);
+
+                                        va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl);
+                                        vy1 = VFMACCVF_FLOAT(vy1, temp, va1, gvl);
+                                        a_ptr += lda;
+                                        ix += inc_x;
+                                }
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
+                                j += gvl * 2;
+                                iy += inc_yv * 2;
+                        }
+                }
+                //tail
+                for(;j < m;){
+                        gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M);
+                        a_ptr = a;
+                        ix = 0;
+                        vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
+                        for(i = 0; i < n; i++){
+                                temp = alpha * x[ix];
+                                va0 = VLEV_FLOAT(&a_ptr[j], gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);
+
+                                a_ptr += lda;
+                                ix += inc_x;
+                        }
+                        VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
+                        j += gvl;
+                }
+        }
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/gemv_t.c b/kernel/riscv64/gemv_t.c
new file mode 100644
index 000000000..169047b72
--- /dev/null
+++ b/kernel/riscv64/gemv_t.c
@@ -0,0 +1,68 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+ * * 2013/09/14 Saar
+ * *	 BLASTEST float		: OK
+ * * 	 BLASTEST double	: OK
+ * 	 CTEST			: OK
+ * 	 TEST			: OK
+ * *
+ * **************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT temp;
+
+	iy = 0;
+	a_ptr = a;
+
+	for (j=0; j<n; j++)
+	{
+		temp = 0.0;
+		ix = 0;
+		for (i=0; i<m; i++)
+		{
+			temp += a_ptr[i] * x[ix];
+			ix    += inc_x;
+		}
+		y[iy] += alpha * temp;
+		iy += inc_y;
+		a_ptr += lda;
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c
new file mode 100644
index 000000000..beca8dc0f
--- /dev/null
+++ b/kernel/riscv64/gemv_t_vector.c
@@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFDOTVV_FLOAT vfdotvv_float32xm4
+#define VFMULVV_FLOAT vfmulvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFDOTVV_FLOAT vfdotvv_float64xm4
+#define VFMULVV_FLOAT vfmulvv_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i = 0, j = 0, k = 0;
+	BLASLONG ix = 0, iy = 0;
+	FLOAT *a_ptr = a;
+        FLOAT temp;
+
+        FLOAT_V_T va, vr, vx;
+        unsigned int gvl = 0;
+        if(inc_x == 1){
+                for(i = 0; i < n; i++){
+                        gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+                        j = 0;
+                        vr = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < m/gvl; k++){
+                                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                                vx = VLEV_FLOAT(&x[j], gvl);
+                                vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
+                                j += gvl;
+                        }
+                        va = VFMVVF_FLOAT(0, gvl);
+                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                        temp = va[0];
+                        if(j < m){
+                                gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M);
+                                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                                vx = VLEV_FLOAT(&x[j], gvl);
+                                vr = VFMULVV_FLOAT(va, vx, gvl);
+
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp += va[0];
+                        }
+                        y[iy] += alpha * temp;
+                        iy += inc_y;
+                        a_ptr += lda;
+                }
+        }else{
+                gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * gvl;
+                for(i = 0; i < n; i++){
+                        gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+                        j = 0;
+                        ix = 0;
+                        vr = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < m/gvl; k++){
+                                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
+                                j += gvl;
+                                ix += inc_xv;
+                        }
+                        va = VFMVVF_FLOAT(0, gvl);
+                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                        temp = va[0];
+                        if(j < m){
+                                gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M);
+                                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vr = VFMULVV_FLOAT(va, vx, gvl);
+
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp += va[0];
+                        }
+                        y[iy] += alpha * temp;
+                        iy += inc_y;
+                        a_ptr += lda;
+                }
+        }
+	return(0);
+}
+
diff --git a/kernel/riscv64/iamax.c b/kernel/riscv64/iamax.c
new file mode 100644
index 000000000..8c016ce4d
--- /dev/null
+++ b/kernel/riscv64/iamax.c
@@ -0,0 +1,77 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+	BLASLONG max=0;
+
+	if (n <= 0 || inc_x <= 0) return(max);
+
+	maxf=ABS(x[0]);
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( ABS(x[ix]) > maxf )
+		{
+			max = i;
+			maxf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(max+1);
+}
+
+
diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c
new file mode 100644
index 000000000..3aa64afc9
--- /dev/null
+++ b/kernel/riscv64/iamax_vector.c
@@ -0,0 +1,191 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	FLOAT maxf=0.0;
+        unsigned int max_index = 0;
+	if (n <= 0 || inc_x <= 0) return(max_index);
+
+        FLOAT_V_T vx, v_max;
+        UINT_V_T v_max_index;
+        MASK_T mask;
+        unsigned int gvl = 0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                v_max_index = VMVVX_UINT(0, gvl);
+                v_max = VFMVVF_FLOAT(-1, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        //index where element greater than v_max
+                        mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+                        v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+                        v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+                        //update v_max and start_index j
+                        v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+                        j += gvl;
+                }
+                vx = VFMVVF_FLOAT(0, gvl);
+                vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                maxf = vx[0];
+                mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        vx = VFMVVF_FLOAT(0, gvl);
+                        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                        FLOAT cur_maxf = vx[0];
+                        if(cur_maxf > maxf){
+                                //tail index
+                                v_max_index = VIDV_UINT(gvl);
+                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                                max_index = VMFIRSTM(mask,gvl);
+                                max_index = v_max_index[max_index];
+                        }
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                unsigned int idx = 0, inc_v = gvl * inc_x;
+
+                v_max_index = VMVVX_UINT(0, gvl);
+                v_max = VFMVVF_FLOAT(-1, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        //index where element greater than v_max
+                        mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+                        v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+                        v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+                        //update v_max and start_index j
+                        v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+                        j += gvl;
+                        idx += inc_v;
+                }
+                vx = VFMVVF_FLOAT(0, gvl);
+                vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                maxf = vx[0];
+                mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        vx = VFMVVF_FLOAT(0, gvl);
+                        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                        FLOAT cur_maxf = vx[0];
+                        if(cur_maxf > maxf){
+                                //tail index
+                                v_max_index = VIDV_UINT(gvl);
+                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                                max_index = VMFIRSTM(mask,gvl);
+                                max_index = v_max_index[max_index];
+                        }
+                }
+        }
+	return(max_index+1);
+}
+
+
diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c
new file mode 100644
index 000000000..155292bd5
--- /dev/null
+++ b/kernel/riscv64/iamin.c
@@ -0,0 +1,77 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf=0.0;
+	BLASLONG min=0;
+
+	if (n <= 0 || inc_x <= 0) return(min);
+
+	minf=ABS(x[0]);
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( ABS(x[ix]) < ABS(minf) )
+		{
+			min = i;
+			minf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(min+1);
+}
+
+
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
new file mode 100644
index 000000000..608f19a00
--- /dev/null
+++ b/kernel/riscv64/iamin_vector.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	FLOAT minf=FLT_MAX;
+        unsigned int min_index = 0;
+	if (n <= 0 || inc_x <= 0) return(min_index);
+
+        FLOAT_V_T vx, v_min;
+        UINT_V_T v_min_index;
+        MASK_T mask;
+        unsigned int gvl = 0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                v_min_index = VMVVX_UINT(0, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        //index where element less than v_min
+                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl);
+                        v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl);
+
+                        //update v_min and start_index j
+                        v_min = VFMINVV_FLOAT(v_min, vx, gvl);
+                        j += gvl;
+                }
+                vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                minf = vx[0];
+                mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+                min_index = VMFIRSTM(mask,gvl);
+                min_index = v_min_index[min_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                        FLOAT cur_minf = vx[0];
+                        if(cur_minf < minf){
+                                //tail index
+                                v_min_index = VIDV_UINT(gvl);
+                                v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
+                                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                                min_index = VMFIRSTM(mask,gvl);
+                                min_index = v_min_index[min_index];
+                        }
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                unsigned int idx = 0, inc_v = gvl * inc_x;
+
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                v_min_index = VMVVX_UINT(0, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        //index where element less than v_min
+                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl);
+                        v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl);
+
+                        //update v_min and start_index j
+                        v_min = VFMINVV_FLOAT(v_min, vx, gvl);
+                        j += gvl;
+                        idx += inc_v;
+                }
+                vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                minf = vx[0];
+                mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+                min_index = VMFIRSTM(mask,gvl);
+                min_index = v_min_index[min_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
+                        v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl);
+
+                        vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                        FLOAT cur_minf = vx[0];
+                        if(cur_minf < minf){
+                                //tail index
+                                v_min_index = VIDV_UINT(gvl);
+                                v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
+                                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                                min_index = VMFIRSTM(mask,gvl);
+                                min_index = v_min_index[min_index];
+                        }
+                }
+        }
+	return(min_index+1);
+}
+
+
diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c
new file mode 100644
index 000000000..5072dd16e
--- /dev/null
+++ b/kernel/riscv64/imax.c
@@ -0,0 +1,69 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+	BLASLONG max=0;
+
+	if (n <= 0 || inc_x <= 0) return(max);
+
+	maxf=x[0];
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( x[ix] > maxf )
+		{
+			max = i;
+			maxf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(max+1);
+}
+
+
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
new file mode 100644
index 000000000..44af7101b
--- /dev/null
+++ b/kernel/riscv64/imax_vector.c
@@ -0,0 +1,176 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+        unsigned int max_index = 0;
+	if (n <= 0 || inc_x <= 0) return(max_index);
+	FLOAT maxf=-FLT_MAX;
+
+        FLOAT_V_T vx, v_max;
+        UINT_V_T v_max_index;
+        MASK_T mask;
+        unsigned int gvl = 0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                v_max_index = VMVVX_UINT(0, gvl);
+                v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+
+                        //index where element greater than v_max
+                        mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+                        v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+                        v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+                        //update v_max and start_index j
+                        v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+                        j += gvl;
+                }
+                vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                maxf = vx[0];
+                mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v_max = VLEV_FLOAT(&x[j], gvl);
+
+                        vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                        FLOAT cur_maxf = vx[0];
+                        if(cur_maxf > maxf){
+                                //tail index
+                                v_max_index = VIDV_UINT(gvl);
+                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                                max_index = VMFIRSTM(mask,gvl);
+                                max_index = v_max_index[max_index];
+                        }
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                unsigned int idx = 0, inc_v = gvl * inc_x;
+
+                v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                v_max_index = VMVVX_UINT(0, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+                        //index where element greater than v_max
+                        mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+                        v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl);
+                        v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl);
+
+                        //update v_max and start_index j
+                        v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
+                        j += gvl;
+                        idx += inc_v;
+                }
+                vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                maxf = vx[0];
+                mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+                max_index = VMFIRSTM(mask,gvl);
+                max_index = v_max_index[max_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+                        vx = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        vx = VFREDMAXVS_FLOAT(v_max, vx, gvl);
+                        FLOAT cur_maxf = vx[0];
+                        if(cur_maxf > maxf){
+                                //tail index
+                                v_max_index = VIDV_UINT(gvl);
+                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                                mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                                max_index = VMFIRSTM(mask,gvl);
+                                max_index = v_max_index[max_index];
+                        }
+                }
+        }
+	return(max_index+1);
+}
+
+
diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c
new file mode 100644
index 000000000..598cba387
--- /dev/null
+++ b/kernel/riscv64/imin.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/08/19 Saar
+*	 BLASTEST float
+* 	 BLASTEST double
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf=0.0;
+	BLASLONG min=0;
+
+	if (n <= 0 || inc_x <= 0) return(min);
+
+	minf=x[0];
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( x[ix] > minf )
+		{
+			min = i;
+			minf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(min+1);
+}
+
+
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
new file mode 100644
index 000000000..e6e0e9f9f
--- /dev/null
+++ b/kernel/riscv64/imin_vector.c
@@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	FLOAT minf=FLT_MAX;
+        unsigned int min_index = 0;
+	if (n <= 0 || inc_x <= 0) return(min_index);
+
+        FLOAT_V_T vx, v_min;
+        UINT_V_T v_min_index;
+        MASK_T mask;
+        unsigned int gvl = 0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                v_min_index = VMVVX_UINT(0, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        //index where element less than v_min
+                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e64,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_min_index)
+        :"v"(mask), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e32,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_min_index)
+        :"v"(mask), "r"(gvl)
+        :"v0");
+#endif
+*/
+                        v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl);
+
+                        //update v_min and start_index j
+                        v_min = VFMINVV_FLOAT(v_min, vx, gvl);
+                        j += gvl;
+                }
+                vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                minf = vx[0];
+                mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+                min_index = VMFIRSTM(mask,gvl);
+                min_index = v_min_index[min_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v_min = VLEV_FLOAT(&x[j], gvl);
+
+                        vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                        FLOAT cur_minf = vx[0];
+                        if(cur_minf < minf){
+                                //tail index
+                                v_min_index = VIDV_UINT(gvl);
+                                v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+                                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                                min_index = VMFIRSTM(mask,gvl);
+                                min_index = v_min_index[min_index];
+                        }
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                unsigned int idx = 0, inc_v = gvl * inc_x;
+
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                v_min_index = VMVVX_UINT(0, gvl);
+                for(i=0,j=0; i < n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+                        //index where element less than v_min
+                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e64,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_min_index)
+        :"v"(mask), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e32,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_min_index)
+        :"v"(mask), "r"(gvl)
+        :"v0");
+#endif
+*/
+
+                        v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl);
+
+                        //update v_min and start_index j
+                        v_min = VFMINVV_FLOAT(v_min, vx, gvl);
+                        j += gvl;
+                        idx += inc_v;
+                }
+                vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                minf = vx[0];
+                mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+                min_index = VMFIRSTM(mask,gvl);
+                min_index = v_min_index[min_index];
+
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+
+                        vx = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        vx = VFREDMINVS_FLOAT(v_min, vx, gvl);
+                        FLOAT cur_minf = vx[0];
+                        if(cur_minf < minf){
+                                //tail index
+                                v_min_index = VIDV_UINT(gvl);
+                                v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+                                mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                                min_index = VMFIRSTM(mask,gvl);
+                                min_index = v_min_index[min_index];
+                        }
+                }
+        }
+	return(min_index+1);
+}
+
+
diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c
new file mode 100644
index 000000000..8fe33e95b
--- /dev/null
+++ b/kernel/riscv64/izamax.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf;
+	BLASLONG max=0;
+	BLASLONG inc_x2;
+
+	if (n <= 0 || inc_x <= 0) return(max);
+
+	inc_x2 = 2 * inc_x;
+
+	maxf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) > maxf )
+		{
+			max = i;
+			maxf = CABS1(x,ix);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(max+1);
+}
+
+
diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c
new file mode 100644
index 000000000..62c95d973
--- /dev/null
+++ b/kernel/riscv64/izamax_vector.c
@@ -0,0 +1,246 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define RVV_EFLOAT RVV_E64
+#define FLOAT_V_T float64xm8_t
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VFADDVV_FLOAT vfaddvv_float64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define FLOAT_V_T float32xm8_t
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VFADDVV_FLOAT vfaddvv_float32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+#define RVV_M RVV_M8
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	FLOAT maxf=0.0;
+        unsigned int max_index = 0;
+	if (n <= 0 || inc_x <= 0) return(max_index);
+
+        FLOAT_V_T vx0, vx1, v_max;
+        UINT_V_T v_max_index;
+        MASK_T mask0, mask1;
+        unsigned int gvl = 0;
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_max_index = VMVVX_UINT(0, gvl);
+        v_max = VFMVVF_FLOAT(-1, gvl);
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG inc_xv = gvl * inc_x * 2;
+        BLASLONG ix = 0;
+        for(i=0,j=0; i < n/gvl; i++){
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                //fabs(vector)
+                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
+                vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                //fabs(vector)
+                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
+                vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
+
+                //index where element greater than v_max
+                mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl);
+                v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e64,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_max_index)
+        :"v"(mask0), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e32,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_max_index)
+        :"v"(mask0), "r"(gvl)
+        :"v0");
+#endif
+*/
+                v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl);
+
+                //update v_max and start_index j
+                v_max = VFMAXVV_FLOAT(v_max, vx0, gvl);
+                j += gvl;
+                ix += inc_xv;
+        }
+        vx0 = VFMVVF_FLOAT(0, gvl);
+        vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl);
+        maxf = vx0[0];
+        mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        max_index = VMFIRSTM(mask0,gvl);
+        max_index = v_max_index[max_index];
+
+        if(j < n){
+                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                v_max_index = VMVVX_UINT(0, gvl);
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                //fabs(vector)
+                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
+                vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                //fabs(vector)
+                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
+                vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                v_max = VFADDVV_FLOAT(vx0, vx1, gvl);
+                vx0 = VFMVVF_FLOAT(0, gvl);
+                vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl);
+                FLOAT cur_maxf = vx0[0];
+                if(cur_maxf > maxf){
+                        //tail index
+                        v_max_index = VIDV_UINT(gvl);
+                        v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+
+                        mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                        max_index = VMFIRSTM(mask0,gvl);
+                        max_index = v_max_index[max_index];
+                }
+        }
+	return(max_index+1);
+}
+
+
diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c
new file mode 100644
index 000000000..fb5a0d4cb
--- /dev/null
+++ b/kernel/riscv64/izamin.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf;
+	BLASLONG min=0;
+	BLASLONG inc_x2;
+
+	if (n <= 0 || inc_x <= 0) return(min);
+
+	inc_x2 = 2 * inc_x;
+
+	minf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) < minf )
+		{
+			min = i;
+			minf = CABS1(x,ix);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(min+1);
+}
+
+
diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c
new file mode 100644
index 000000000..38eccf1b5
--- /dev/null
+++ b/kernel/riscv64/izamin_vector.c
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if defined(DOUBLE)
+
+#define RVV_EFLOAT RVV_E64
+#define FLOAT_V_T float64xm8_t
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8
+#define VMFIRSTM vmfirstm_e64xm8
+#define UINT_V_T uint64xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint64xm8
+#define VIDV_UINT vidv_uint64xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8
+#define VADDVX_UINT vaddvx_uint64xm8
+#define VFADDVV_FLOAT vfaddvv_float64xm8
+#define VMVVX_UINT vmvvx_uint64xm8
+#else
+
+#define ABS fabsf
+#define RVV_EFLOAT RVV_E32
+#define FLOAT_V_T float32xm8_t
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8
+#define VMFIRSTM vmfirstm_e32xm8
+#define UINT_V_T uint32xm8_t
+#define VIDV_MASK_UINT vidv_mask_uint32xm8
+#define VIDV_UINT vidv_uint32xm8
+#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8
+#define VADDVX_UINT vaddvx_uint32xm8
+#define VFADDVV_FLOAT vfaddvv_float32xm8
+#define VMVVX_UINT vmvvx_uint32xm8
+#endif
+
+#define RVV_M RVV_M8
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	FLOAT minf=FLT_MAX;
+        unsigned int min_index = 0;
+	if (n <= 0 || inc_x <= 0) return(min_index);
+
+        FLOAT_V_T vx0, vx1, v_min;
+        UINT_V_T v_min_index;
+        MASK_T mask0, mask1;
+        unsigned int gvl = 0;
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_min_index = VMVVX_UINT(0, gvl);
+        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG inc_xv = gvl * inc_x * 2;
+        BLASLONG ix = 0;
+        for(i=0,j=0; i < n/gvl; i++){
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                //fabs(vector)
+                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
+                vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                //fabs(vector)
+                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
+                vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
+
+                //index where element less than v_min
+                mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl);
+                v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e64,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_min_index)
+        :"v"(mask0), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv v0, %1, %1 \n\t"
+        "vsetvli x0, %2, e32,m8 \n\t"
+        "vid.v %0, v0.t \n\t"
+        :"+v"(v_min_index)
+        :"v"(mask0), "r"(gvl)
+        :"v0");
+#endif
+*/
+                v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl);
+
+                //update v_min and start_index j
+                v_min = VFMINVV_FLOAT(v_min, vx0, gvl);
+                j += gvl;
+                ix += inc_xv;
+        }
+        vx0 = VFMVVF_FLOAT(FLT_MAX, gvl);
+        vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl);
+        minf = vx0[0];
+        mask0 = VMFLEVF_FLOAT(v_min, minf, gvl);
+        min_index = VMFIRSTM(mask0,gvl);
+        min_index = v_min_index[min_index];
+
+        if(j < n){
+                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                v_min_index = VMVVX_UINT(0, gvl);
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                //fabs(vector)
+                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
+                vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx0)
+        :"v"(mask0), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                //fabs(vector)
+                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
+                vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl);
+/*
+#if defined(DOUBLE)
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e64,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#else
+asm volatile(
+        "vor.vv     v0, %1, %1\n\t"
+        "vsetvli    x0, %3, e32,m8 \n\t"
+        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
+        :"+v"(vx1)
+        :"v"(mask1), "f"(zero), "r"(gvl)
+        :"v0");
+#endif
+*/
+                v_min = VFADDVV_FLOAT(vx0, vx1, gvl);
+                vx0 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl);
+                FLOAT cur_minf = vx0[0];
+                if(cur_minf < minf){
+                        //tail index
+                        v_min_index = VIDV_UINT(gvl);
+                        v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
+                        mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                        min_index = VMFIRSTM(mask0,gvl);
+                        min_index = v_min_index[min_index];
+                }
+        }
+	return(min_index+1);
+}
+
+
diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c
new file mode 100644
index 000000000..2ad956bc0
--- /dev/null
+++ b/kernel/riscv64/max.c
@@ -0,0 +1,65 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+
+	if (n <= 0 || inc_x <= 0) return(maxf);
+
+	maxf=x[0];
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( x[ix] > maxf )
+		{
+			maxf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(maxf);
+}
+
+
diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c
new file mode 100644
index 000000000..4ef75452d
--- /dev/null
+++ b/kernel/riscv64/max_vector.c
@@ -0,0 +1,116 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	if (n <= 0 || inc_x <= 0) return(0.0);
+	FLOAT maxf=-FLT_MAX;
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_max;
+
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v1 = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl);
+                        maxf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        v1 = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] > maxf)
+                                maxf = v0[0];
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        BLASLONG idx = 0, inc_xv = inc_x * gvl;
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
+
+                                v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
+                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                j += gvl * 2;
+                                idx += inc_xv * 2;
+                        }
+                        v1 = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl);
+                        maxf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        v1 = VFMVVF_FLOAT(-FLT_MAX, gvl);
+                        v0 = VFREDMAXVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] > maxf)
+                                maxf = v0[0];
+                        j += gvl;
+                }
+        }
+	return(maxf);
+}
+
+
diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c
new file mode 100644
index 000000000..2812fe397
--- /dev/null
+++ b/kernel/riscv64/min.c
@@ -0,0 +1,65 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf=0.0;
+
+	if (n <= 0 || inc_x <= 0) return(minf);
+
+	minf=x[0];
+	ix += inc_x;
+	i++;
+
+	while(i < n)
+	{
+		if( x[ix] < minf )
+		{
+			minf = x[ix];
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(minf);
+}
+
+
diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c
new file mode 100644
index 000000000..83c965bfa
--- /dev/null
+++ b/kernel/riscv64/min_vector.c
@@ -0,0 +1,116 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	if (n <= 0 || inc_x <= 0) return(0.0);
+	FLOAT minf=FLT_MAX;
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_min;
+
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v_min, v1, gvl);
+                        minf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] < minf)
+                                minf = v0[0];
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        BLASLONG idx = 0, inc_xv = inc_x * gvl;
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
+
+                                v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
+                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                j += gvl * 2;
+                                idx += inc_xv * 2;
+                        }
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v_min, v1, gvl);
+                        minf = v0[0];
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        v1 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        v0 = VFREDMINVS_FLOAT(v0, v1, gvl);
+                        if(v0[0] < minf)
+                                minf = v0[0];
+                        j += gvl;
+                }
+        }
+	return(minf);
+}
+
+
diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c
new file mode 100644
index 000000000..fcff09337
--- /dev/null
+++ b/kernel/riscv64/nrm2.c
@@ -0,0 +1,88 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/13 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	FLOAT scale = 0.0;
+	FLOAT ssq   = 1.0;
+	FLOAT absxi = 0.0;
+
+
+	if (n <= 0 || inc_x <= 0) return(0.0);
+	if ( n == 1 ) return( ABS(x[0]) );
+
+	n *= inc_x;
+	while(i < n)
+	{
+
+		if ( x[i] != 0.0 )
+		{
+			absxi = ABS( x[i] );
+			if ( scale < absxi )
+			{
+				ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+				scale = absxi ;
+			}
+			else
+			{
+				ssq += ( absxi/scale ) * ( absxi/scale );
+			}
+
+		}
+		i += inc_x;
+	}
+	scale = scale * sqrt( ssq );
+	return(scale);
+
+}
+
+
diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c
new file mode 100644
index 000000000..785c0d2f8
--- /dev/null
+++ b/kernel/riscv64/nrm2_vector.c
@@ -0,0 +1,220 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFDOTVV_FLOAT vfdotvv_float32xm4
+#define ABS fabsf
+#define MASK_T e32xm4_t
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4
+#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4
+#define VMFIRSTM vmfirstm_e32xm4
+#define VFDIVVF_FLOAT vfdivvf_float32xm4
+#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFDOTVV_FLOAT vfdotvv_float64xm4
+#define ABS fabs
+#define MASK_T e64xm4_t
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4
+#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4
+#define VMFIRSTM vmfirstm_e64xm4
+#define VFDIVVF_FLOAT vfdivvf_float64xm4
+#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+
+	if ( n < 0 )  return(0.0);
+        if(n == 1) return (ABS(x[0]));
+
+        FLOAT_V_T vr, v0, v_zero;
+        unsigned int gvl = 0;
+        FLOAT scale = 0.0, ssq = 0.0;
+        MASK_T mask;
+        BLASLONG index = 0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                for(i=0,j=0; i<n/gvl; i++){
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0){
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                }
+                        }else{//found greater element
+                                //ssq in vector vr: vr[0]
+                                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                                //total ssq before current vector
+                                ssq += vr[0];
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                //ssq in vector vr
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        }
+                        j += gvl;
+                }
+                //ssq in vector vr: vr[0]
+                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                //total ssq now
+                ssq += vr[0];
+
+                //tail
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0)
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        }else{//found greater element
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        }
+                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        //ssq in vector vr: vr[0]
+                        vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                        //total ssq now
+                        ssq += vr[0];
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                int idx = 0, inc_v = inc_x * gvl;
+                for(i=0,j=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0){
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                }
+                        }else{//found greater element
+                                //ssq in vector vr: vr[0]
+                                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                                //total ssq before current vector
+                                ssq += vr[0];
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                //ssq in vector vr
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        }
+                        j += gvl;
+                        idx += inc_v;
+                }
+                //ssq in vector vr: vr[0]
+                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                //total ssq now
+                ssq += vr[0];
+
+                //tail
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0)
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        }else{//found greater element
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        }
+                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        //ssq in vector vr: vr[0]
+                        vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                        //total ssq now
+                        ssq += vr[0];
+                }
+        }
+	return(scale * sqrt(ssq));
+}
+
+
diff --git a/kernel/riscv64/nrm2_vector_dot.c b/kernel/riscv64/nrm2_vector_dot.c
new file mode 100644
index 000000000..a3d15406c
--- /dev/null
+++ b/kernel/riscv64/nrm2_vector_dot.c
@@ -0,0 +1,128 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDSUM_FLOAT vfredsumvs_float32xm8
+#define VFMACCVV_FLOAT vfmaccvv_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFDOTVV_FLOAT vfdotvv_float32xm8
+#define ABS fabsf
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDSUM_FLOAT vfredsumvs_float64xm8
+#define VFMACCVV_FLOAT vfmaccvv_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFDOTVV_FLOAT vfdotvv_float64xm8
+#define ABS fabs
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	double len = 0.0 ;
+
+	if ( n < 0 )  return(0.0);
+        if(n == 1) return (ABS(x[0]));
+
+        FLOAT_V_T vr, v0, v1;
+        unsigned int gvl = 0;
+        if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl < n/2){
+                        vr = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                j += gvl;
+
+                                v1 = VLEV_FLOAT(&x[j], gvl);
+                                vr = VFMACCVV_FLOAT(vr, v1, v1, gvl);
+                                j += gvl;
+                        }
+                        v0 = VFMVVF_FLOAT(0, gvl);
+                        v0 = VFREDSUM_FLOAT(vr, v0, gvl);
+                        len += v0[0];
+                }
+                //tail
+                for(;j < n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        //v1 = 0
+                        v1 = VFMVVF_FLOAT(0, gvl);
+                        //vr = VFDOTVV_FLOAT(v0, v0, gvl);
+                        vr = VFMACCVV_FLOAT(v1, v0, v0, gvl);
+                        v0 = VFREDSUM_FLOAT(vr, v1, gvl);
+                        len += v0[0];
+
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                if(gvl < n/2){
+                        vr = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+                                v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                                vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                j += gvl;
+
+                                v1 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                                vr = VFMACCVV_FLOAT(vr, v1, v1, gvl);
+                                j += gvl;
+                        }
+                        v0 = VFMVVF_FLOAT(0, gvl);
+                        v0 = VFREDSUM_FLOAT(vr, v0, gvl);
+                        len += v0[0];
+                }
+                //tail
+                for(;j < n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        //v1 = 0
+                        v1 = VFMVVF_FLOAT(0, gvl);
+                        //vr = VFDOTVV_FLOAT(v0, v0, gvl);
+                        vr = VFMACCVV_FLOAT(v1, v0, v0, gvl);
+                        v0 = VFREDSUM_FLOAT(vr, v1, gvl);
+                        len += v0[0];
+
+                        j += gvl;
+                }
+        }
+	return(sqrt(len));
+}
+
+
diff --git a/kernel/riscv64/omatcopy_cn.c b/kernel/riscv64/omatcopy_cn.c
new file mode 100644
index 000000000..4d11b9125
--- /dev/null
+++ b/kernel/riscv64/omatcopy_cn.c
@@ -0,0 +1,90 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+	bptr = b;
+
+	if ( alpha == 0.0 )
+	{
+		for ( i=0; i<cols ; i++ )
+		{
+			for(j=0; j<rows; j++)
+			{
+				bptr[j] = 0.0;
+			}
+			bptr += ldb;
+		}
+		return(0);
+	}
+
+	if ( alpha == 1.0 )
+	{
+		for ( i=0; i<cols ; i++ )
+		{
+			for(j=0; j<rows; j++)
+			{
+				bptr[j] = aptr[j];
+			}
+			aptr += lda;
+			bptr += ldb;
+		}
+		return(0);
+	}
+
+	for ( i=0; i<cols ; i++ )
+	{
+		for(j=0; j<rows; j++)
+		{
+			bptr[j] = alpha * aptr[j];
+		}
+		aptr += lda;
+		bptr += ldb;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/omatcopy_ct.c b/kernel/riscv64/omatcopy_ct.c
new file mode 100644
index 000000000..b2587813f
--- /dev/null
+++ b/kernel/riscv64/omatcopy_ct.c
@@ -0,0 +1,89 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+
+	if ( alpha == 0.0 )
+	{
+		for ( i=0; i<cols ; i++ )
+		{
+			bptr = &b[i];
+			for(j=0; j<rows; j++)
+			{
+				bptr[j*ldb] = 0.0;
+			}
+		}
+		return(0);
+	}
+
+	if ( alpha == 1.0 )
+	{
+		for ( i=0; i<cols ; i++ )
+		{
+			bptr = &b[i];
+			for(j=0; j<rows; j++)
+			{
+				bptr[j*ldb] = aptr[j];
+			}
+			aptr += lda;
+		}
+		return(0);
+	}
+
+	for ( i=0; i<cols ; i++ )
+	{
+		bptr = &b[i];
+		for(j=0; j<rows; j++)
+		{
+			bptr[j*ldb] = alpha * aptr[j];
+		}
+		aptr += lda;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/omatcopy_rn.c b/kernel/riscv64/omatcopy_rn.c
new file mode 100644
index 000000000..57515e729
--- /dev/null
+++ b/kernel/riscv64/omatcopy_rn.c
@@ -0,0 +1,90 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+	bptr = b;
+
+	if ( alpha == 0.0 )
+	{
+		for ( i=0; i<rows ; i++ )
+		{
+			for(j=0; j<cols; j++)
+			{
+				bptr[j] = 0.0;
+			}
+			bptr += ldb;
+		}
+		return(0);
+	}
+
+	if ( alpha == 1.0 )
+	{
+		for ( i=0; i<rows ; i++ )
+		{
+			for(j=0; j<cols; j++)
+			{
+				bptr[j] = aptr[j];
+			}
+			aptr += lda;
+			bptr += ldb;
+		}
+		return(0);
+	}
+
+	for ( i=0; i<rows ; i++ )
+	{
+		for(j=0; j<cols; j++)
+		{
+			bptr[j] = alpha * aptr[j];
+		}
+		aptr += lda;
+		bptr += ldb;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/omatcopy_rt.c b/kernel/riscv64/omatcopy_rt.c
new file mode 100644
index 000000000..9d58350d5
--- /dev/null
+++ b/kernel/riscv64/omatcopy_rt.c
@@ -0,0 +1,62 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+
+	for ( i=0; i<rows ; i++ )
+	{
+		bptr = &b[i];
+		for(j=0; j<cols; j++)
+		{
+			bptr[j*ldb] = alpha * aptr[j];
+		}
+		aptr += lda;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/rot.c b/kernel/riscv64/rot.c
new file mode 100644
index 000000000..18b4ca252
--- /dev/null
+++ b/kernel/riscv64/rot.c
@@ -0,0 +1,62 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT temp;
+
+	if ( n <= 0     )  return(0);
+
+	while(i < n)
+	{
+		temp   = c*x[ix] + s*y[iy] ;
+		y[iy]  = c*y[iy] - s*x[ix] ;
+		x[ix]  = temp ;
+
+		ix += inc_x ;
+		iy += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c
new file mode 100644
index 000000000..aeabca1ba
--- /dev/null
+++ b/kernel/riscv64/rot_vector.c
@@ -0,0 +1,196 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMULVF_FLOAT vfmulvf_float32xm4
+#define VFMSACVF_FLOAT vfmsacvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMULVF_FLOAT vfmulvf_float64xm4
+#define VFMSACVF_FLOAT vfmsacvf_float64xm4
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0,iy=0;
+
+	if(n <= 0)  return(0);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, vx, vy;
+
+        if(inc_x == 1 && inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSEV_FLOAT(&x[j], v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSEV_FLOAT(&y[j], v1, gvl);
+
+                        j += gvl;
+                }
+                if(j<n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSEV_FLOAT(&x[j], v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSEV_FLOAT(&y[j], v1, gvl);
+                }
+        }else if(inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * gvl;
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSEV_FLOAT(&y[j], v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                }
+                if(j<n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        vy = VLEV_FLOAT(&y[j], gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSSEV_FLOAT(&x[j*inc_x], stride_x, v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSEV_FLOAT(&y[j], v1, gvl);
+                }
+        }else if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                BLASLONG inc_yv = inc_y * gvl;
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSEV_FLOAT(&x[j], v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSSEV_FLOAT(&y[iy], stride_y, v1, gvl);
+
+                        j += gvl;
+                        iy += inc_yv;
+                }
+                if(j<n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLEV_FLOAT(&x[j], gvl);
+                        vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSEV_FLOAT(&x[j], v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl);
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * gvl;
+                BLASLONG inc_yv = inc_y * gvl;
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSSEV_FLOAT(&y[iy], stride_y, v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                        iy += inc_yv;
+                }
+                if(j<n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx = VLSEV_FLOAT(&x[j*inc_x],stride_x, gvl);
+                        vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl);
+
+                        v0 = VFMULVF_FLOAT(vx, c, gvl);
+                        v0 = VFMACCVF_FLOAT(v0, s, vy, gvl);
+                        VSSEV_FLOAT(&x[j*inc_x], stride_x, v0, gvl);
+
+                        v1 = VFMULVF_FLOAT(vx, s, gvl);
+                        v1 = VFMSACVF_FLOAT(v1, c, vy, gvl);
+                        VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl);
+                }
+        }
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/scal.c b/kernel/riscv64/scal.c
new file mode 100644
index 000000000..4ef49e293
--- /dev/null
+++ b/kernel/riscv64/scal.c
@@ -0,0 +1,63 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0,j=0;
+
+	if ( (n <= 0) || (inc_x <= 0))
+		return(0);
+	
+
+	while(j < n)
+	{
+
+		if ( da == 0.0 )
+			x[i]=0.0;
+		else
+			x[i] = da * x[i] ;
+
+		i += inc_x ;
+		j++;
+
+	}
+	return 0;
+
+}
+
+
diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c
new file mode 100644
index 000000000..5152eea06
--- /dev/null
+++ b/kernel/riscv64/scal_vector.c
@@ -0,0 +1,133 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VSEV_FLOAT vsev_float32xm8
+#define VSSEV_FLOAT vssev_float32xm8
+#define VFMULVF_FLOAT vfmulvf_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VSEV_FLOAT vsev_float64xm8
+#define VSSEV_FLOAT vssev_float64xm8
+#define VFMULVF_FLOAT vfmulvf_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0,j=0;
+
+	if ( (n <= 0) || (inc_x <= 0))
+		return(0);
+
+        FLOAT_V_T v0, v1;
+        unsigned int gvl = 0;
+        if(inc_x == 1){
+                if(da == 0.0){
+                        memset(&x[0], 0, n * sizeof(FLOAT));
+                }else{
+                        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                        if(gvl <= n / 2){
+                                for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
+                                        v0 = VLEV_FLOAT(&x[j], gvl);
+                                        v0 = VFMULVF_FLOAT(v0, da,gvl);
+                                        VSEV_FLOAT(&x[j], v0, gvl);
+
+                                        v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                        v1 = VFMULVF_FLOAT(v1, da, gvl);
+                                        VSEV_FLOAT(&x[j+gvl], v1, gvl);
+                                }
+                        }
+                        //tail
+                        for(; j <n; ){
+                                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                v0 = VFMULVF_FLOAT(v0, da, gvl);
+                                VSEV_FLOAT(&x[j], v0, gvl);
+                                j += gvl;
+                        }
+                }
+        }else{
+                if(da == 0.0){
+                        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                        if(gvl <= n / 2){
+                                v0 = VFMVVF_FLOAT(0, gvl);
+                                for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
+                                        VSEV_FLOAT(&x[j], v0, gvl);
+                                        VSEV_FLOAT(&x[j+gvl], v0, gvl);
+                                }
+                        }
+                        //tail
+                        for(; j <n; ){
+                                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                v0 = VFMVVF_FLOAT(0, gvl);
+                                VSEV_FLOAT(&x[j], v0, gvl);
+                                j += gvl;
+                        }
+                }else{
+                        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                        BLASLONG ix = 0;
+                        if(gvl < n / 2){
+                                BLASLONG inc_xv = gvl * inc_x;
+                                for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
+                                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        v0 = VFMULVF_FLOAT(v0, da,gvl);
+                                        VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+
+                                        v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                        v1 = VFMULVF_FLOAT(v1, da, gvl);
+                                        VSSEV_FLOAT(&x[ix+inc_xv], stride_x, v1, gvl);
+                                        ix += inc_xv * 2;
+                                }
+                        }
+                        //tail
+                        for(; j <n; ){
+                                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                v0 = VFMULVF_FLOAT(v0, da, gvl);
+                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                                j += gvl;
+                                ix += inc_x * gvl;
+                        }
+                }
+        }
+	return 0;
+}
+
+
diff --git a/kernel/riscv64/sgemm_kernel_16x4_c910v.c b/kernel/riscv64/sgemm_kernel_16x4_c910v.c
new file mode 100644
index 000000000..83507e744
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_16x4_c910v.c
@@ -0,0 +1,1575 @@
+#include "common.h"
+#include <riscv-vector.h>
+
+#define KERNEL16x4_I \
+	"addi       t1,    %[PB], 1*4  \n\t"\
+	"addi       t2,    %[PB], 2*4  \n\t"\
+	"addi       t3,    %[PB], 3*4  \n\t"\
+	"flw        ft0,  (%[PB])      \n\t"\
+	"flw        ft1,  (t1)         \n\t"\
+	"flw        ft2,  (t2)         \n\t"\
+	"flw        ft3,  (t3)         \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       t4,    %[PA], 4*4  \n\t"\
+	"addi       t5,    %[PA], 8*4  \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"addi       t6,    %[PA], 12*4  \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"\
+	"vle.v      v2,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vle.v      v3,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"vfmv.v.f   v10,  ft2          \n\t"\
+	"addi       %[PB], %[PB], 4*4  \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vfmv.v.f   v11,  ft3          \n\t"\
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"vle.v      v6,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vfmacc.vv  v18,  v8,    v2   \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"\
+	"vle.v      v7,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"vfmacc.vv  v19,  v8,    v3   \n\t"\
+	"flw        ft4,  (%[PB])   \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"flw        ft5,  (t1)        \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"flw        ft6,  (t2)        \n\t"\
+	"vfmacc.vv  v22,  v9,    v2   \n\t"\
+	"flw        ft7,  (t3)        \n\t"\
+	"vfmacc.vv  v23,  v9,    v3   \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"vfmacc.vv  v24,  v10,    v0    \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"\
+	"vfmacc.vv  v25,  v10,    v1    \n\t"\
+	"vfmv.v.f   v14,  ft6          \n\t"\
+	"vfmacc.vv  v26,  v10,    v2    \n\t"\
+	"vfmv.v.f   v15,  ft7          \n\t"\
+	"vfmacc.vv  v27,  v10,    v3    \n\t"\
+        "addi       %[PB], %[PB], 4*4  \n\t"\
+	"vfmacc.vv  v28,  v11,    v0    \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vfmacc.vv  v29,  v11,    v1    \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"vfmacc.vv  v30,  v11,    v2    \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"\
+	"vfmacc.vv  v31,  v11,    v3    \n\t"
+
+#define KERNEL16x4_M1 \
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmacc.vv  v18,  v8,    v2   \n\t"\
+	"vle.v      v6,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vfmacc.vv  v19,  v8,    v3   \n\t"\
+	"vle.v      v7,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"flw        ft4,  (%[PB])      \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"flw        ft5,  (t1)        \n\t"\
+	"vfmacc.vv  v22,  v9,    v2   \n\t"\
+	"flw        ft6,  (t2)        \n\t"\
+	"vfmacc.vv  v23,  v9,    v3   \n\t"\
+	"flw        ft7,  (t3)        \n\t"\
+        "addi       %[PB], %[PB], 4*4  \n\t"\
+	"vfmacc.vv  v24,  v10,    v0   \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vfmacc.vv  v25,  v10,    v1   \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"vfmacc.vv  v26,  v10,    v2   \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"vfmacc.vv  v27,  v10,    v3   \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"\
+	"vfmacc.vv  v28,  v11,    v0   \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"\
+	"vfmacc.vv  v29,  v11,    v1   \n\t"\
+	"vfmv.v.f   v14,  ft6          \n\t"\
+	"vfmacc.vv  v30,  v11,    v2   \n\t"\
+	"vfmacc.vv  v31,  v11,    v3   \n\t"\
+	"vfmv.v.f   v15,  ft7          \n\t"
+
+#define KERNEL16x4_M2 \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmacc.vv  v18,  v12,    v6   \n\t"\
+	"vle.v      v2,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vfmacc.vv  v19,  v12,    v7   \n\t"\
+	"vle.v      v3,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"flw        ft0,  (%[PB])      \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"flw        ft1,  (t1)         \n\t"\
+	"vfmacc.vv  v22,  v13,    v6   \n\t"\
+	"flw        ft2,  (t2)         \n\t"\
+	"vfmacc.vv  v23,  v13,    v7   \n\t"\
+	"flw        ft3,  (t3)         \n\t"\
+        "addi       %[PB], %[PB], 4*4  \n\t"\
+	"vfmacc.vv  v24,  v14,    v4   \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vfmacc.vv  v25,  v14,    v5   \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"vfmacc.vv  v26,  v14,    v6   \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"vfmacc.vv  v27,  v14,    v7   \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"\
+	"vfmacc.vv  v28,  v15,    v4   \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"\
+	"vfmacc.vv  v29,  v15,    v5   \n\t"\
+	"vfmv.v.f   v10,  ft2          \n\t"\
+	"vfmacc.vv  v30,  v15,    v6   \n\t"\
+	"vfmacc.vv  v31,  v15,    v7   \n\t"\
+	"vfmv.v.f   v11,  ft3          \n\t"
+
+#define KERNEL16x4_E \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vfmacc.vv  v18,  v12,    v6   \n\t"\
+	"vfmacc.vv  v19,  v12,    v7   \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"vfmacc.vv  v22,  v13,    v6   \n\t"\
+	"vfmacc.vv  v23,  v13,    v7   \n\t"\
+	"vfmacc.vv  v24,  v14,    v4   \n\t"\
+	"vfmacc.vv  v25,  v14,    v5   \n\t"\
+	"vfmacc.vv  v26,  v14,    v6   \n\t"\
+	"vfmacc.vv  v27,  v14,    v7   \n\t"\
+	"vfmacc.vv  v28,  v15,    v4   \n\t"\
+	"vfmacc.vv  v29,  v15,    v5   \n\t"\
+	"vfmacc.vv  v30,  v15,    v6   \n\t"\
+	"vfmacc.vv  v31,  v15,    v7   \n\t"
+
+
+#define KERNEL8x4_I \
+	"addi       t1,    %[PB], 1*4  \n\t"\
+	"addi       t2,    %[PB], 2*4  \n\t"\
+	"addi       t3,    %[PB], 3*4  \n\t"\
+	"flw        ft0,  (%[PB])      \n\t"\
+	"flw        ft1,  (t1)         \n\t"\
+	"flw        ft2,  (t2)         \n\t"\
+	"flw        ft3,  (t3)         \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       t4,    %[PA], 4*4  \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"addi       %[PA], %[PA], 8*4  \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*4  \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"\
+	"vfmv.v.f   v10,  ft2          \n\t"\
+	"addi       %[PB], %[PB], 4*4  \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 8*4  \n\t"\
+	"vfmv.v.f   v11,  ft3          \n\t"\
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*4  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"flw        ft4,  (%[PB])   \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"flw        ft5,  (t1)        \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"flw        ft6,  (t2)        \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"flw        ft7,  (t3)        \n\t"\
+	"vfmacc.vv  v24,  v10,    v0    \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"\
+	"vfmacc.vv  v25,  v10,    v1    \n\t"\
+	"vfmv.v.f   v14,  ft6          \n\t"\
+        "addi       %[PB], %[PB], 4*4  \n\t"\
+	"vfmv.v.f   v15,  ft7          \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vfmacc.vv  v28,  v11,    v0    \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"vfmacc.vv  v29,  v11,    v1    \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"
+
+
+#define KERNEL8x4_M1 \
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 8*4  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*4  \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"flw        ft4,  (%[PB])      \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"flw        ft5,  (t1)        \n\t"\
+        "addi       %[PB], %[PB], 4*4  \n\t"\
+	"flw        ft6,  (t2)        \n\t"\
+	"vfmacc.vv  v24,  v10,    v0   \n\t"\
+	"flw        ft7,  (t3)        \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vfmacc.vv  v25,  v10,    v1   \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"\
+	"vfmacc.vv  v28,  v11,    v0   \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"\
+	"vfmacc.vv  v29,  v11,    v1   \n\t"\
+	"vfmv.v.f   v14,  ft6          \n\t"\
+	"vfmv.v.f   v15,  ft7          \n\t"
+
+#define KERNEL8x4_M2 \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 8*4  \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    8*4  \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"flw        ft0,  (%[PB])      \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"flw        ft1,  (t1)         \n\t"\
+        "addi       %[PB], %[PB], 4*4  \n\t"\
+	"flw        ft2,  (t2)         \n\t"\
+	"vfmacc.vv  v24,  v14,    v4   \n\t"\
+	"flw        ft3,  (t3)         \n\t"\
+	"addi       t1,   t1,     4*4  \n\t"\
+	"vfmacc.vv  v25,  v14,    v5   \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"addi       t2,   t2,     4*4  \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"\
+	"vfmacc.vv  v28,  v15,    v4   \n\t"\
+	"addi       t3,   t3,     4*4  \n\t"\
+	"vfmacc.vv  v29,  v15,    v5   \n\t"\
+	"vfmv.v.f   v10,  ft2          \n\t"\
+	"vfmv.v.f   v11,  ft3          \n\t"
+
+#define KERNEL8x4_E \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"vfmacc.vv  v24,  v14,    v4   \n\t"\
+	"vfmacc.vv  v25,  v14,    v5   \n\t"\
+	"vfmacc.vv  v28,  v15,    v4   \n\t"\
+	"vfmacc.vv  v29,  v15,    v5   \n\t"
+
+
+#define KERNEL16x2_I \
+	"addi       t1,    %[PB], 1*4  \n\t"\
+	"flw        ft0,  (%[PB])      \n\t"\
+	"flw        ft1,  (t1)         \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       t4,    %[PA], 4*4  \n\t"\
+	"addi       t5,    %[PA], 8*4  \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"addi       t6,    %[PA], 12*4  \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"\
+	"vle.v      v2,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vle.v      v3,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"addi       %[PB], %[PB], 2*4  \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"addi       t1,   t1,     2*4  \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"vle.v      v6,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vfmacc.vv  v18,  v8,    v2   \n\t"\
+	"vle.v      v7,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"vfmacc.vv  v19,  v8,    v3   \n\t"\
+	"flw        ft4,  (%[PB])   \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"flw        ft5,  (t1)        \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"addi       %[PB], %[PB], 2*4  \n\t"\
+	"vfmacc.vv  v22,  v9,    v2   \n\t"\
+	"addi       t1,   t1,     2*4  \n\t"\
+	"vfmacc.vv  v23,  v9,    v3   \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"
+	
+
+#define KERNEL16x2_M1 \
+	"vfmacc.vv  v16,  v8,    v0   \n\t"\
+	"vle.v      v4,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vfmacc.vv  v17,  v8,    v1   \n\t"\
+	"vle.v      v5,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmacc.vv  v18,  v8,    v2   \n\t"\
+	"vle.v      v6,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vfmacc.vv  v19,  v8,    v3   \n\t"\
+	"vle.v      v7,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"flw        ft4,  (%[PB])      \n\t"\
+	"vfmacc.vv  v20,  v9,    v0   \n\t"\
+	"flw        ft5,  (t1)        \n\t"\
+	"vfmacc.vv  v21,  v9,    v1   \n\t"\
+	"vfmv.v.f   v12,  ft4          \n\t"\
+	"vfmacc.vv  v22,  v9,    v2   \n\t"\
+	"addi       t1,   t1,     2*4  \n\t"\
+	"vfmacc.vv  v23,  v9,    v3   \n\t"\
+	"addi       %[PB], %[PB], 2*4  \n\t"\
+	"vfmv.v.f   v13,  ft5          \n\t"
+
+
+#define KERNEL16x2_M2 \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vle.v      v0,   (%[PA])      \n\t"\
+	"addi       %[PA], %[PA], 16*4  \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vle.v      v1,   (t4)         \n\t"\
+	"addi       t4,    t4,    16*4  \n\t"\
+	"vfmacc.vv  v18,  v12,    v6   \n\t"\
+	"vle.v      v2,   (t5)         \n\t"\
+	"addi       t5,    t5,    16*4  \n\t"\
+	"vfmacc.vv  v19,  v12,    v7   \n\t"\
+	"vle.v      v3,   (t6)         \n\t"\
+	"addi       t6,    t6,    16*4  \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"flw        ft0,  (%[PB])      \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"flw        ft1,  (t1)         \n\t"\
+	"vfmacc.vv  v22,  v13,    v6   \n\t"\
+	"vfmv.v.f   v8,   ft0          \n\t"\
+	"vfmacc.vv  v23,  v13,    v7   \n\t"\
+        "addi       %[PB], %[PB], 2*4  \n\t"\
+	"addi       t1,   t1,     2*4  \n\t"\
+	"vfmv.v.f   v9,   ft1          \n\t"
+
+
+#define KERNEL16x2_E \
+	"vfmacc.vv  v16,  v12,    v4   \n\t"\
+	"vfmacc.vv  v17,  v12,    v5   \n\t"\
+	"vfmacc.vv  v18,  v12,    v6   \n\t"\
+	"vfmacc.vv  v19,  v12,    v7   \n\t"\
+	"vfmacc.vv  v20,  v13,    v4   \n\t"\
+	"vfmacc.vv  v21,  v13,    v5   \n\t"\
+	"vfmacc.vv  v22,  v13,    v6   \n\t"\
+	"vfmacc.vv  v23,  v13,    v7   \n\t"
+
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+		,BLASLONG offset
+#endif
+		)
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3;
+   FLOAT *ptrba,*ptrbb;
+   
+   FLOAT loadb0,loadb1,loadb2,loadb3;
+   FLOAT load0,load1,load2,load3,load4,load5,load6,load7;
+
+   FLOAT res0,res1,res2,res3;
+   FLOAT res4,res5,res6,res7;
+   FLOAT res8,res9,res10,res11;
+   FLOAT res12,res13,res14,res15;
+
+   for (j=0; j<bn/4; j+=1){
+	   C0 = C;
+	   C1 = C0+ldc;
+	   C2 = C1+ldc;
+	   C3 = C2+ldc;
+
+	   ptrba = ba;
+	   for(i=0; i<bm/16; i+=1){
+		   ptrbb = bb;
+		   //t0 for k
+		   //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
+		   //v0-v3,v4-v7 for A, t4-t6 for PA1-3
+		   //v16-v31 for temp C
+		   
+		   asm volatile(
+				"vsetvli    zero, zero, e32,m1 \n\t"
+				"fmv.w.x    ft11, zero         \n\t"
+				"mv         t0,   %[BK]        \n\t"
+				
+				"vfmv.v.f   v16,  ft11         \n\t"
+				"vfmv.v.f   v17,  ft11         \n\t"
+				"vfmv.v.f   v18,  ft11         \n\t"
+				"vfmv.v.f   v19,  ft11         \n\t"
+
+				"vfmv.v.f   v20,  ft11         \n\t"
+				"vfmv.v.f   v21,  ft11         \n\t"
+				"vfmv.v.f   v22,  ft11         \n\t"
+				"vfmv.v.f   v23,  ft11         \n\t"
+
+				"vfmv.v.f   v24,  ft11         \n\t"
+				"vfmv.v.f   v25,  ft11         \n\t"
+				"vfmv.v.f   v26,  ft11         \n\t"
+				"vfmv.v.f   v27,  ft11         \n\t"
+				
+				"vfmv.v.f   v28,  ft11         \n\t"
+				"vfmv.v.f   v29,  ft11         \n\t"
+				"vfmv.v.f   v30,  ft11         \n\t"
+				"vfmv.v.f   v31,  ft11         \n\t"
+				//unloop 8
+				"srli       t0,   %[BK], 3     \n\t"
+				"blez       t0,   M16x4_TAIL    \n\t"
+				
+				//preloop
+				KERNEL16x4_I
+				KERNEL16x4_M2
+				KERNEL16x4_M1
+				KERNEL16x4_M2
+				"addi       t0,   t0, -1       \n\t"
+				"blez       t0,   M16x4_MAINLOOP_TAIL    \n\t"
+				".align 4                      \n\t"
+				"M16x4_MAINLOOP:                \n\t"
+				KERNEL16x4_M1
+				KERNEL16x4_M2
+				KERNEL16x4_M1
+				KERNEL16x4_M2
+				KERNEL16x4_M1
+				KERNEL16x4_M2
+				KERNEL16x4_M1
+				KERNEL16x4_M2
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M16x4_MAINLOOP \n\t"
+				
+				"M16x4_MAINLOOP_TAIL:           \n\t"
+				KERNEL16x4_M1
+				KERNEL16x4_M2
+				KERNEL16x4_M1
+				KERNEL16x4_E
+				
+				//tail
+				"M16x4_TAIL:                    \n\t"
+				"andi       t0,   %[BK], 7     \n\t"
+				"blez       t0,   M16x4_SAVERESULT   \n\t"
+
+				"addi       t4,    %[PA], 4*4  \n\t"
+				"addi       t5,    %[PA], 8*4  \n\t"
+				"addi       t6,    %[PA], 12*4  \n\t"
+				"addi       t1,    %[PB], 1*4  \n\t"
+				"addi       t2,    %[PB], 2*4  \n\t"
+				"addi       t3,    %[PB], 3*4  \n\t"
+
+				".align 4                      \n\t"
+				"M16x4_TAILLOOP:                \n\t"
+				"flw        ft0,  (%[PB])      \n\t"
+				"addi       %[PB], %[PB], 4*4  \n\t"
+				"vle.v      v0,   (%[PA])      \n\t"
+				"add        %[PA], %[PA], 16*4  \n\t"
+				"vle.v      v1,   (t4)         \n\t"
+				"addi       t4,    t4,    16*4  \n\t"
+
+				"vfmv.v.f   v8,   ft0          \n\t"
+				"flw        ft1,  (t1)         \n\t"
+				"addi       t1,   t1,     4*4  \n\t"
+				"vle.v      v2,   (t5)         \n\t"
+				"addi       t5,    t5,    16*4  \n\t"
+				"vle.v      v3,   (t6)         \n\t"
+				"addi       t6,    t6,    16*4  \n\t"
+
+				"vfmacc.vv  v16,  v8,    v0    \n\t"
+				"flw        ft2,  (t2)         \n\t"
+				"addi       t2,   t2,    4*4  \n\t"
+				"vfmacc.vv  v17,  v8,    v1    \n\t"
+				"vfmacc.vv  v18,  v8,    v2    \n\t"
+				"vfmv.v.f   v9,   ft1          \n\t"
+				"vfmacc.vv  v19,  v8,    v3    \n\t"
+								
+
+				"vfmacc.vv  v20,  v9,    v0    \n\t"
+				"flw        ft3,  (t3)         \n\t"
+				"addi       t3,   t3,    4*4  \n\t"
+				"vfmacc.vv  v21,  v9,    v1    \n\t"
+				"vfmacc.vv  v22,  v9,    v2    \n\t"
+				"vfmv.v.f   v10,  ft2          \n\t"
+				"vfmacc.vv  v23,  v9,    v3    \n\t"
+
+				"vfmv.v.f   v11,  ft3          \n\t"
+				"vfmacc.vv  v24,  v10,    v0    \n\t"
+				"vfmacc.vv  v25,  v10,    v1    \n\t"
+				"vfmacc.vv  v26,  v10,    v2    \n\t"
+				"vfmacc.vv  v27,  v10,    v3    \n\t"
+
+				"vfmacc.vv  v28,  v11,    v0    \n\t"
+				"vfmacc.vv  v29,  v11,    v1    \n\t"
+				"vfmacc.vv  v30,  v11,    v2    \n\t"
+				"vfmacc.vv  v31,  v11,    v3    \n\t"
+
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M16x4_TAILLOOP \n\t"
+				
+				//Save result
+				//load C
+				"M16x4_SAVERESULT:              \n\t"
+				//use v8 to store alpha
+				"vfmv.v.f   v8,   %[ALPHA]     \n\t"
+				"vle.v      v0,   (%[C0])      \n\t"
+				"addi       t4,   %[C0], 4*4   \n\t"
+				"vle.v      v1,   (%[C1])      \n\t"
+				"addi       t5,   %[C1], 4*4   \n\t"
+				"vle.v      v2,   (%[C2])      \n\t"
+				"addi       t6,   %[C2], 4*4   \n\t"
+				"vle.v      v3,   (%[C3])      \n\t"
+				"addi       t3,   %[C3], 4*4   \n\t"
+				
+				//Multiply Alpha
+				"vfmacc.vv  v0,   v8, v16 \n\t"
+				"vle.v      v4,   (t4)          \n\t"
+				"vfmacc.vv  v1,   v8, v20 \n\t"
+				"vle.v      v5,   (t5)          \n\t"
+				"vfmacc.vv  v2,   v8, v24 \n\t"
+				"vle.v      v6,   (t6)          \n\t"
+				"vfmacc.vv  v3,   v8, v28 \n\t"
+				"vle.v      v7,   (t3)          \n\t"
+
+				"vfmacc.vv  v4,   v8, v17 \n\t"
+				"vse.v      v0,   (%[C0])      \n\t"
+				"add        %[C0], %[C0], 8*4  \n\t"
+				"vfmacc.vv  v5,   v8, v21 \n\t"
+				"vse.v      v1,   (%[C1])      \n\t"
+				"add        %[C1], %[C1], 8*4  \n\t"
+				
+				"vfmacc.vv  v6,   v8, v25 \n\t"
+				"vse.v      v2,   (%[C2])      \n\t"
+				"add        %[C2], %[C2], 8*4  \n\t"
+
+				"vfmacc.vv  v7,   v8, v29 \n\t"
+				"vse.v      v3,   (%[C3])      \n\t"
+				"add        %[C3], %[C3], 8*4  \n\t"
+
+				"vle.v      v0,   (%[C0])      \n\t"
+				"vse.v      v4,   (t4)         \n\t"
+				"add        t4,   t4,     8*4  \n\t"
+				
+				"vle.v      v1,   (%[C1])      \n\t"
+				"vse.v      v5,   (t5)         \n\t"
+				"add        t5,   t5,     8*4  \n\t"
+
+				"vle.v      v2,   (%[C2])      \n\t"
+				"vse.v      v6,   (t6)         \n\t"
+				"add        t6,   t6,     8*4  \n\t"
+
+				"vle.v      v3,   (%[C3])      \n\t"
+				"vse.v      v7,   (t3)         \n\t"
+				"add        t3,   t3,     8*4  \n\t"
+
+
+				"vfmacc.vv  v0,   v8, v18 \n\t"
+				"vle.v      v4,   (t4)          \n\t"
+				"vfmacc.vv  v1,   v8, v22 \n\t"
+				"vle.v      v5,   (t5)          \n\t"
+				"vfmacc.vv  v2,   v8, v26 \n\t"
+				"vle.v      v6,   (t6)          \n\t"
+				"vfmacc.vv  v3,   v8, v30 \n\t"
+				"vle.v      v7,   (t3)          \n\t"
+
+				"vfmacc.vv  v4,   v8, v19 \n\t"
+				"vse.v      v0,   (%[C0])      \n\t"
+				"add        %[C0], %[C0], 8*4  \n\t"
+
+				"vfmacc.vv  v5,   v8, v23 \n\t"
+				"vse.v      v1,   (%[C1])      \n\t"
+				"add        %[C1], %[C1], 8*4  \n\t"
+
+				"vfmacc.vv  v6,   v8, v27 \n\t"
+				"vse.v      v2,   (%[C2])      \n\t"
+				"add        %[C2], %[C2], 8*4  \n\t"
+
+				"vfmacc.vv  v7,   v8, v31 \n\t"
+				"vse.v      v3,   (%[C3])      \n\t"
+				"add        %[C3], %[C3], 8*4  \n\t"
+
+				"vse.v      v4,   (t4)         \n\t"
+				"vse.v      v5,   (t5)         \n\t"
+				"vse.v      v6,   (t6)         \n\t"
+				"vse.v      v7,   (t3)         \n\t"
+				"M16x4_END:                     \n\t"
+				
+				:[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
+				 [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
+				:[ALPHA]"f"(alpha), [BK]"r"(bk)
+				:"cc", "t0", "t4","t5","t6","t3","t1","t2",
+				 "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
+				 "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
+				 "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
+				 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+				 "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+	   }
+	   if(bm&8){
+   		   ptrbb = bb;
+		   //t0 for k
+		   //ft0-ft3,ft4-ft7,v8-v15 for B, t1-t3 for PB1-3
+		   //v0-v3,v4-v7 for A, t4-t6 for PA1-3
+		   //v16-v31 for temp C
+		   
+		   asm volatile(
+				"vsetvli    zero, zero, e32,m1 \n\t"
+				"fmv.w.x    ft11, zero         \n\t"
+				"mv         t0,   %[BK]        \n\t"
+				
+				"vfmv.v.f   v16,  ft11         \n\t"
+				"vfmv.v.f   v17,  ft11         \n\t"
+				
+				"vfmv.v.f   v20,  ft11         \n\t"
+				"vfmv.v.f   v21,  ft11         \n\t"
+				
+				"vfmv.v.f   v24,  ft11         \n\t"
+				"vfmv.v.f   v25,  ft11         \n\t"
+								
+				"vfmv.v.f   v28,  ft11         \n\t"
+				"vfmv.v.f   v29,  ft11         \n\t"
+				
+				//unloop 8
+				"srli       t0,   %[BK], 3     \n\t"
+				"blez       t0,   M8x4_TAIL    \n\t"
+				
+				//preloop
+				KERNEL8x4_I
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				"addi       t0,   t0, -1       \n\t"
+				"blez       t0,   M8x4_MAINLOOP_TAIL    \n\t"
+				".align 4                      \n\t"
+				"M8x4_MAINLOOP:                \n\t"
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M8x4_MAINLOOP \n\t"
+				
+				"M8x4_MAINLOOP_TAIL:           \n\t"
+				KERNEL8x4_M1
+				KERNEL8x4_M2
+				KERNEL8x4_M1
+				KERNEL8x4_E
+				
+				//tail
+				"M8x4_TAIL:                    \n\t"
+				"andi       t0,   %[BK], 7     \n\t"
+				"blez       t0,   M8x4_SAVERESULT   \n\t"
+
+				"addi       t4,    %[PA], 4*4  \n\t"
+				
+				"addi       t1,    %[PB], 1*4  \n\t"
+				"addi       t2,    %[PB], 2*4  \n\t"
+				"addi       t3,    %[PB], 3*4  \n\t"
+
+				".align 4                      \n\t"
+				"M8x4_TAILLOOP:                \n\t"
+				"flw        ft0,  (%[PB])      \n\t"
+				"addi       %[PB], %[PB], 4*4  \n\t"
+				"vle.v      v0,   (%[PA])      \n\t"
+				"add        %[PA], %[PA], 8*4  \n\t"
+				"vle.v      v1,   (t4)         \n\t"
+				"addi       t4,    t4,    8*4  \n\t"
+
+				"vfmv.v.f   v8,   ft0          \n\t"
+				"flw        ft1,  (t1)         \n\t"
+				"addi       t1,   t1,     4*4  \n\t"
+				
+				"vfmacc.vv  v16,  v8,    v0    \n\t"
+				"flw        ft2,  (t2)         \n\t"
+				"addi       t2,   t2,    4*4  \n\t"
+				"vfmacc.vv  v17,  v8,    v1    \n\t"
+				"vfmv.v.f   v9,   ft1          \n\t"
+
+				"vfmacc.vv  v20,  v9,    v0    \n\t"
+				"flw        ft3,  (t3)         \n\t"
+				"addi       t3,   t3,    4*4  \n\t"
+				"vfmacc.vv  v21,  v9,    v1    \n\t"
+				"vfmv.v.f   v10,  ft2          \n\t"
+
+				"vfmv.v.f   v11,  ft3          \n\t"
+				"vfmacc.vv  v24,  v10,    v0    \n\t"
+				"vfmacc.vv  v25,  v10,    v1    \n\t"
+
+				"vfmacc.vv  v28,  v11,    v0    \n\t"
+				"vfmacc.vv  v29,  v11,    v1    \n\t"
+
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M8x4_TAILLOOP \n\t"
+				
+				//Save result
+				//load C
+				"M8x4_SAVERESULT:              \n\t"
+				//use v8 to store alpha
+				"vfmv.v.f   v8,   %[ALPHA]     \n\t"
+				"vle.v      v0,   (%[C0])      \n\t"
+				"addi       t4,   %[C0], 4*4   \n\t"
+				"vle.v      v1,   (%[C1])      \n\t"
+				"addi       t5,   %[C1], 4*4   \n\t"
+				"vle.v      v2,   (%[C2])      \n\t"
+				"addi       t6,   %[C2], 4*4   \n\t"
+				"vle.v      v3,   (%[C3])      \n\t"
+				"addi       t3,   %[C3], 4*4   \n\t"
+				
+				//Multiply Alpha
+				"vfmacc.vv  v0,   v8, v16 \n\t"
+				"vle.v      v4,   (t4)          \n\t"
+				"vfmacc.vv  v1,   v8, v20 \n\t"
+				"vle.v      v5,   (t5)          \n\t"
+				"vfmacc.vv  v2,   v8, v24 \n\t"
+				"vle.v      v6,   (t6)          \n\t"
+				"vfmacc.vv  v3,   v8, v28 \n\t"
+				"vle.v      v7,   (t3)          \n\t"
+
+				"vfmacc.vv  v4,   v8, v17 \n\t"
+				"vse.v      v0,   (%[C0])      \n\t"
+				"add        %[C0], %[C0], 8*4  \n\t"
+				"vfmacc.vv  v5,   v8, v21 \n\t"
+				"vse.v      v1,   (%[C1])      \n\t"
+				"add        %[C1], %[C1], 8*4  \n\t"
+				
+				"vfmacc.vv  v6,   v8, v25 \n\t"
+				"vse.v      v2,   (%[C2])      \n\t"
+				"add        %[C2], %[C2], 8*4  \n\t"
+
+				"vfmacc.vv  v7,   v8, v29 \n\t"
+				"vse.v      v3,   (%[C3])      \n\t"
+				"add        %[C3], %[C3], 8*4  \n\t"
+				
+				"vse.v      v4,   (t4)         \n\t"
+				"vse.v      v5,   (t5)         \n\t"
+				"vse.v      v6,   (t6)         \n\t"
+				"vse.v      v7,   (t3)         \n\t"
+				"M8x4_END:                     \n\t"
+				
+				:[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
+				 [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
+				:[ALPHA]"f"(alpha), [BK]"r"(bk)
+				:"cc", "t0", "t4","t5","t6","t3","t1","t2",
+				 "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
+				 "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
+				 "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
+				 "v16", "v17", "v20", "v21", 
+				 "v24", "v25", "v28", "v29");
+	   }
+	   if(bm&4){
+		   ptrbb = bb;
+      		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   res4 = 0;
+		   res5 = 0;
+		   res6 = 0;
+		   res7 = 0;
+		   res8 = 0;
+		   res9 = 0;
+		   res10 = 0;
+		   res11 = 0;
+		   res12 = 0;
+		   res13 = 0;
+		   res14 = 0;
+		   res15 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+			   load2 = ptrba[2];
+			   load3 = ptrba[3];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+			   res2 = res2 + load2 * loadb0;
+			   res3 = res3 + load3 * loadb0;
+
+			   res4 = res4 + load0 * loadb1;
+			   res5 = res5 + load1 * loadb1;
+			   res6 = res6 + load2 * loadb1;
+			   res7 = res7 + load3 * loadb1;
+
+			   loadb2 = ptrbb[2];
+			   loadb3 = ptrbb[3];
+			   
+   			   res8 = res8 + load0 * loadb2;
+			   res9 = res9 + load1 * loadb2;
+			   res10 = res10 + load2 * loadb2;
+			   res11 = res11 + load3 * loadb2;
+
+			   res12 = res12 + load0 * loadb3;
+			   res13 = res13 + load1 * loadb3;
+			   res14 = res14 + load2 * loadb3;
+			   res15 = res15 + load3 * loadb3;
+
+			   ptrba += 4;
+			   ptrbb += 4;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+		   res6 = res6 * alpha;
+		   res7 = res7 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+		   res10 = res10 * alpha;
+		   res11 = res11 * alpha;
+		   res12 = res12 * alpha;
+		   res13 = res13 * alpha;
+		   res14 = res14 * alpha;
+		   res15 = res15 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   
+		   C1[0] += res4;
+		   C1[1] += res5;
+		   C1[2] += res6;
+		   C1[3] += res7;
+
+   		   C2[0] += res8;
+		   C2[1] += res9;
+		   C2[2] += res10;
+		   C2[3] += res11;
+		   
+		   C3[0] += res12;
+		   C3[1] += res13;
+		   C3[2] += res14;
+		   C3[3] += res15;
+
+		   C0 += 4;
+		   C1 += 4;
+		   C2 += 4;
+		   C3 += 4;
+	   }
+   	   if(bm&2){
+		   ptrbb = bb;
+		   
+       		   res0 = 0;
+		   res1 = 0;
+		   
+		   res4 = 0;
+		   res5 = 0;
+		   
+		   res8 = 0;
+		   res9 = 0;
+		   
+		   res12 = 0;
+		   res13 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+
+			   res4 = res4 + load0 * loadb1;
+			   res5 = res5 + load1 * loadb1;
+
+			   loadb2 = ptrbb[2];
+			   loadb3 = ptrbb[3];
+			   
+   			   res8 = res8 + load0 * loadb2;
+			   res9 = res9 + load1 * loadb2;
+
+			   res12 = res12 + load0 * loadb3;
+			   res13 = res13 + load1 * loadb3;
+
+			   ptrba += 2;
+			   ptrbb += 4;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+
+		   res12 = res12 * alpha;
+		   res13 = res13 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+
+		   C1[0] += res4;
+		   C1[1] += res5;
+
+   		   C2[0] += res8;
+		   C2[1] += res9;
+		   
+		   C3[0] += res12;
+		   C3[1] += res13;
+
+		   C0 += 2;
+		   C1 += 2;
+		   C2 += 2;
+		   C3 += 2;
+	   }
+	   if(bm&1){
+		   ptrbb = bb;
+		   		   
+       		   res0 = 0;
+		   
+		   res4 = 0;
+		   
+		   res8 = 0;
+		   
+		   res12 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+				   
+			   res0 = res0 + load0 * loadb0;
+
+			   res4 = res4 + load0 * loadb1;
+
+			   loadb2 = ptrbb[2];
+			   loadb3 = ptrbb[3];
+			   
+   			   res8 = res8 + load0 * loadb2;
+
+			   res12 = res12 + load0 * loadb3;
+
+			   ptrba += 1;
+			   ptrbb += 4;
+		   }
+		   
+      		   res0 = res0 * alpha;
+
+		   res4 = res4 * alpha;
+
+       		   res8 = res8 * alpha;
+
+		   res12 = res12 * alpha;
+
+		   C0[0] += res0;
+		   C1[0] += res4;
+   		   C2[0] += res8;
+		   C3[0] += res12;
+
+		   C0 += 1;
+		   C1 += 1;
+		   C2 += 1;
+		   C3 += 1;
+	   }
+	   
+	   k = bk<<2;
+	   bb = bb+k;
+	   i = ldc<<2;
+	   C = C+i;
+   }
+   
+   if(bn&2){
+	   C0 = C;
+	   C1 = C0+ldc;
+
+	   ptrba = ba;
+	   for(i=0; i<bm/16; i+=1){
+		   ptrbb = bb;
+   		   asm volatile(
+				"vsetvli    zero, zero, e32,m1 \n\t"
+				"fmv.w.x    ft11, zero         \n\t"
+				"mv         t0,   %[BK]        \n\t"
+				
+				"vfmv.v.f   v16,  ft11         \n\t"
+				"vfmv.v.f   v17,  ft11         \n\t"
+				"vfmv.v.f   v18,  ft11         \n\t"
+				"vfmv.v.f   v19,  ft11         \n\t"
+
+				"vfmv.v.f   v20,  ft11         \n\t"
+				"vfmv.v.f   v21,  ft11         \n\t"
+				"vfmv.v.f   v22,  ft11         \n\t"
+				"vfmv.v.f   v23,  ft11         \n\t"
+
+				//unloop 8
+				"srli       t0,   %[BK], 3     \n\t"
+				"blez       t0,   M16x2_TAIL    \n\t"
+				
+				//preloop
+				KERNEL16x2_I
+				KERNEL16x2_M2
+				KERNEL16x2_M1
+				KERNEL16x2_M2
+				"addi       t0,   t0, -1       \n\t"
+				"blez       t0,   M16x2_MAINLOOP_TAIL    \n\t"
+				".align 4                      \n\t"
+				"M16x2_MAINLOOP:                \n\t"
+				KERNEL16x2_M1
+				KERNEL16x2_M2
+				KERNEL16x2_M1
+				KERNEL16x2_M2
+				KERNEL16x2_M1
+				KERNEL16x2_M2
+				KERNEL16x2_M1
+				KERNEL16x2_M2
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M16x2_MAINLOOP \n\t"
+				
+				"M16x2_MAINLOOP_TAIL:           \n\t"
+				KERNEL16x2_M1
+				KERNEL16x2_M2
+				KERNEL16x2_M1
+				KERNEL16x2_E
+				
+				//tail
+				"M16x2_TAIL:                    \n\t"
+				"andi       t0,   %[BK], 7     \n\t"
+				"blez       t0,   M16x2_SAVERESULT   \n\t"
+
+				"addi       t4,    %[PA], 4*4  \n\t"
+				"addi       t5,    %[PA], 8*4  \n\t"
+				"addi       t6,    %[PA], 12*4  \n\t"
+				"addi       t1,    %[PB], 1*4  \n\t"
+
+				".align 4                      \n\t"
+				"M16x2_TAILLOOP:                \n\t"
+				"flw        ft0,  (%[PB])      \n\t"
+				"addi       %[PB], %[PB], 2*4  \n\t"
+				"vle.v      v0,   (%[PA])      \n\t"
+				"add        %[PA], %[PA], 16*4  \n\t"
+				"vle.v      v1,   (t4)         \n\t"
+				"addi       t4,    t4,    16*4  \n\t"
+
+				"vfmv.v.f   v8,   ft0          \n\t"
+				"flw        ft1,  (t1)         \n\t"
+				"addi       t1,   t1,     2*4  \n\t"
+				"vle.v      v2,   (t5)         \n\t"
+				"addi       t5,    t5,    16*4  \n\t"
+				"vle.v      v3,   (t6)         \n\t"
+				"addi       t6,    t6,    16*4  \n\t"
+
+				"vfmv.v.f   v9,   ft1          \n\t"				
+				"vfmacc.vv  v16,  v8,    v0    \n\t"
+				"vfmacc.vv  v17,  v8,    v1    \n\t"
+				"vfmacc.vv  v18,  v8,    v2    \n\t"
+				"vfmacc.vv  v19,  v8,    v3    \n\t"
+
+				"vfmacc.vv  v20,  v9,    v0    \n\t"
+				"vfmacc.vv  v21,  v9,    v1    \n\t"
+				"vfmacc.vv  v22,  v9,    v2    \n\t"
+				"vfmacc.vv  v23,  v9,    v3    \n\t"
+
+				"addi       t0,   t0, -1       \n\t"
+				"bgtz       t0,   M16x2_TAILLOOP \n\t"
+				
+				//Save result
+				//load C
+				"M16x2_SAVERESULT:              \n\t"
+				//use v8 to store alpha
+				"vfmv.v.f   v8,   %[ALPHA]     \n\t"
+				"vle.v      v0,   (%[C0])      \n\t"
+				"addi       t4,   %[C0], 4*4   \n\t"
+				"vle.v      v1,   (%[C1])      \n\t"
+				"addi       t5,   %[C1], 4*4   \n\t"
+				
+				//Multiply Alpha
+				"vfmacc.vv  v0,   v8, v16 \n\t"
+				"vle.v      v4,   (t4)          \n\t"
+				"vfmacc.vv  v1,   v8, v20 \n\t"
+				"vle.v      v5,   (t5)          \n\t"
+				
+				"vfmacc.vv  v4,   v8, v17 \n\t"
+				"vse.v      v0,   (%[C0])      \n\t"
+				"add        %[C0], %[C0], 8*4  \n\t"
+				"vfmacc.vv  v5,   v8, v21 \n\t"
+				"vse.v      v1,   (%[C1])      \n\t"
+				"add        %[C1], %[C1], 8*4  \n\t"
+				
+				"vle.v      v0,   (%[C0])      \n\t"
+				"vse.v      v4,   (t4)         \n\t"
+				"add        t4,   t4,     8*4  \n\t"
+				
+				"vle.v      v1,   (%[C1])      \n\t"
+				"vse.v      v5,   (t5)         \n\t"
+				"add        t5,   t5,     8*4  \n\t"
+
+				"vfmacc.vv  v0,   v8, v18 \n\t"
+				"vle.v      v4,   (t4)          \n\t"
+				"vfmacc.vv  v1,   v8, v22 \n\t"
+				"vle.v      v5,   (t5)          \n\t"
+
+				"vfmacc.vv  v4,   v8, v19 \n\t"
+				"vse.v      v0,   (%[C0])      \n\t"
+				"add        %[C0], %[C0], 8*4  \n\t"
+
+				"vfmacc.vv  v5,   v8, v23 \n\t"
+				"vse.v      v1,   (%[C1])      \n\t"
+				"add        %[C1], %[C1], 8*4  \n\t"
+
+				"vse.v      v4,   (t4)         \n\t"
+				"vse.v      v5,   (t5)         \n\t"
+				"M16x2_END:                     \n\t"
+				
+				:[C0]"+r"(C0),[C1]"+r"(C1),
+				 [PA]"+r"(ptrba), [PB]"+r"(ptrbb)
+				:[ALPHA]"f"(alpha), [BK]"r"(bk)
+				:"cc", "t0", "t4","t5","t6","t3","t1","t2",
+				 "ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
+				 "v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
+				 "v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
+				 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+
+	   }
+	   if(bm&8){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   res4 = 0;
+		   res5 = 0;
+		   res6 = 0;
+		   res7 = 0;
+		   res8 = 0;
+		   res9 = 0;
+		   res10 = 0;
+		   res11 = 0;
+		   res12 = 0;
+		   res13 = 0;
+		   res14 = 0;
+		   res15 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+			   load2 = ptrba[2];
+			   load3 = ptrba[3];
+			   load4 = ptrba[4];
+			   load5 = ptrba[5];
+			   load6 = ptrba[6];
+			   load7 = ptrba[7];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+			   res2 = res2 + load2 * loadb0;
+			   res3 = res3 + load3 * loadb0;
+
+			   res4 = res4 + load4 * loadb0;
+			   res5 = res5 + load5 * loadb0;
+			   res6 = res6 + load6 * loadb0;
+			   res7 = res7 + load7 * loadb0;
+
+   			   res8 = res8 + load0 * loadb1;
+			   res9 = res9 + load1 * loadb1;
+			   res10 = res10 + load2 * loadb1;
+			   res11 = res11 + load3 * loadb1;
+
+			   res12 = res12 + load4 * loadb1;
+			   res13 = res13 + load5 * loadb1;
+			   res14 = res14 + load6 * loadb1;
+			   res15 = res15 + load7 * loadb1;
+
+			   ptrba += 8;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+		   res6 = res6 * alpha;
+		   res7 = res7 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+		   res10 = res10 * alpha;
+		   res11 = res11 * alpha;
+		   res12 = res12 * alpha;
+		   res13 = res13 * alpha;
+		   res14 = res14 * alpha;
+		   res15 = res15 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   C0[4] += res4;
+		   C0[5] += res5;
+		   C0[6] += res6;
+		   C0[7] += res7;
+
+   		   C1[0] += res8;
+		   C1[1] += res9;
+		   C1[2] += res10;
+		   C1[3] += res11;
+		   C1[4] += res12;
+		   C1[5] += res13;
+		   C1[6] += res14;
+		   C1[7] += res15;
+
+		   C0 += 8;
+		   C1 += 8;
+	   }
+	   if(bm&4){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   
+		   res8 = 0;
+		   res9 = 0;
+		   res10 = 0;
+		   res11 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+			   load2 = ptrba[2];
+			   load3 = ptrba[3];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+			   res2 = res2 + load2 * loadb0;
+			   res3 = res3 + load3 * loadb0;
+
+   			   res8 = res8 + load0 * loadb1;
+			   res9 = res9 + load1 * loadb1;
+			   res10 = res10 + load2 * loadb1;
+			   res11 = res11 + load3 * loadb1;
+
+			   ptrba += 4;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+		   res10 = res10 * alpha;
+		   res11 = res11 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+
+   		   C1[0] += res8;
+		   C1[1] += res9;
+		   C1[2] += res10;
+		   C1[3] += res11;
+
+		   C0 += 4;
+		   C1 += 4;
+	   }
+   	   if(bm&2){
+		   ptrbb = bb;
+      		   res0 = 0;
+		   res1 = 0;
+		   
+		   res8 = 0;
+		   res9 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+
+			   load0 = ptrba[0];
+			   load1 = ptrba[1];
+				   
+			   res0 = res0 + load0 * loadb0;
+			   res1 = res1 + load1 * loadb0;
+
+   			   res8 = res8 + load0 * loadb1;
+			   res9 = res9 + load1 * loadb1;
+
+			   ptrba += 2;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+
+       		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+
+   		   C1[0] += res8;
+		   C1[1] += res9;
+		   
+		   C0 += 2;
+		   C1 += 2;
+	   }
+	   if(bm&1){
+		   ptrbb = bb;
+       		   res0 = 0;
+		   res8 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   loadb1 = ptrbb[1];
+			   load0 = ptrba[0];
+				   
+			   res0 = res0 + load0 * loadb0;
+   			   res8 = res8 + load0 * loadb1;
+			   ptrba += 1;
+			   ptrbb += 2;
+		   }
+		   
+      		   res0 = res0 * alpha;
+       		   res8 = res8 * alpha;
+
+		   C0[0] += res0;
+   		   C1[0] += res8;
+		   
+		   C0 += 1;
+		   C1 += 1;
+	   }
+	   k = bk<<1;
+	   bb = bb+k;
+	   i = ldc<<1;
+	   C = C+i;
+   }
+
+   if (bn&1){
+	   C0 = C;
+	   ptrba = ba;
+	   for(i=0; i<bm/16; i+=1){
+	 	   ptrbb = bb;
+		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   res4 = 0;
+		   res5 = 0;
+		   res6 = 0;
+		   res7 = 0;
+		   
+		   res8 = 0;
+		   res9 = 0;
+		   res10 = 0;
+		   res11 = 0;
+		   res12 = 0;
+		   res13 = 0;
+		   res14 = 0;
+		   res15 = 0;
+		   
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   res1 = res1 + ptrba[1] * loadb0;
+			   res2 = res2 + ptrba[2] * loadb0;
+			   res3 = res3 + ptrba[3] * loadb0;
+
+			   res4 = res4 + ptrba[4] * loadb0;
+			   res5 = res5 + ptrba[5] * loadb0;
+			   res6 = res6 + ptrba[6] * loadb0;
+			   res7 = res7 + ptrba[7] * loadb0;
+			   
+			   res8 = res8 + ptrba[8] * loadb0;
+			   res9 = res9 + ptrba[9] * loadb0;
+			   res10 = res10 + ptrba[10] * loadb0;
+			   res11 = res11 + ptrba[11] * loadb0;
+
+			   res12 = res12 + ptrba[12] * loadb0;
+			   res13 = res13 + ptrba[13] * loadb0;
+			   res14 = res14 + ptrba[14] * loadb0;
+			   res15 = res15 + ptrba[15] * loadb0;
+			   
+			   ptrba += 16;
+			   ptrbb += 1;
+		   }
+   		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+		   res6 = res6 * alpha;
+		   res7 = res7 * alpha;
+		   
+   		   res8 = res8 * alpha;
+		   res9 = res9 * alpha;
+		   res10 = res10 * alpha;
+		   res11 = res11 * alpha;
+		   res12 = res12 * alpha;
+		   res13 = res13 * alpha;
+		   res14 = res14 * alpha;
+		   res15 = res15 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   C0[4] += res4;
+		   C0[5] += res5;
+		   C0[6] += res6;
+		   C0[7] += res7;
+		   
+		   C0[8] += res8;
+		   C0[9] += res9;
+		   C0[10] += res10;
+		   C0[11] += res11;
+		   C0[12] += res12;
+		   C0[13] += res13;
+		   C0[14] += res14;
+		   C0[15] += res15;
+		   
+		   C0 += 16;
+
+	   }
+	   
+	   if(bm&8){
+		   ptrbb = bb;
+		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   res4 = 0;
+		   res5 = 0;
+		   res6 = 0;
+		   res7 = 0;
+
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   res1 = res1 + ptrba[1] * loadb0;
+			   res2 = res2 + ptrba[2] * loadb0;
+			   res3 = res3 + ptrba[3] * loadb0;
+
+			   res4 = res4 + ptrba[4] * loadb0;
+			   res5 = res5 + ptrba[5] * loadb0;
+			   res6 = res6 + ptrba[6] * loadb0;
+			   res7 = res7 + ptrba[7] * loadb0;
+			   
+			   ptrba += 8;
+			   ptrbb += 1;
+		   }
+   		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+		   res4 = res4 * alpha;
+		   res5 = res5 * alpha;
+		   res6 = res6 * alpha;
+		   res7 = res7 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   C0[4] += res4;
+		   C0[5] += res5;
+		   C0[6] += res6;
+		   C0[7] += res7;
+		   
+		   C0 += 8;
+	   }
+	   if(bm&4){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   res2 = 0;
+		   res3 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   res1 = res1 + ptrba[1] * loadb0;
+			   res2 = res2 + ptrba[2] * loadb0;
+			   res3 = res3 + ptrba[3] * loadb0;
+
+			   ptrba += 4;
+			   ptrbb += 1;
+		   }
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+		   res2 = res2 * alpha;
+		   res3 = res3 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   C0[2] += res2;
+		   C0[3] += res3;
+		   
+		   C0 += 4;
+	   }
+   	   if(bm&2){
+		   ptrbb = bb;
+   		   res0 = 0;
+		   res1 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   res1 = res1 + ptrba[1] * loadb0;
+
+			   ptrba += 2;
+			   ptrbb += 1;
+		   }
+      		   res0 = res0 * alpha;
+		   res1 = res1 * alpha;
+
+		   C0[0] += res0;
+		   C0[1] += res1;
+		   
+		   C0 += 2;
+	   }
+	   if(bm&1){
+   		   ptrbb = bb;
+   		   res0 = 0;
+		   for(k=0; k<bk; k+=1){
+			   loadb0 = ptrbb[0];
+			   res0 = res0 + ptrba[0] * loadb0;
+			   ptrba += 1;
+			   ptrbb += 1;
+		   }
+      		   res0 = res0 * alpha;
+		   C0[0] += res0;
+		   C0 += 1;
+	   }
+
+	   k = bk;
+	   bb = bb+k;
+	   C = C+ldc;
+   }
+   return 0;
+}
diff --git a/kernel/riscv64/swap.c b/kernel/riscv64/swap.c
new file mode 100644
index 000000000..eac621fb2
--- /dev/null
+++ b/kernel/riscv64/swap.c
@@ -0,0 +1,62 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/08/20 Saar
+*	 BLASTEST float		OK
+* 	 BLASTEST double	OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <stdio.h>
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT temp;
+
+	if ( n < 0     )  return(0);
+
+	while(i < n)
+	{
+
+		temp  = x[ix] ;
+		x[ix] = y[iy] ;
+		y[iy] = temp ;
+
+		ix += inc_x ;
+		iy += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
new file mode 100644
index 000000000..9377bf4b9
--- /dev/null
+++ b/kernel/riscv64/swap_vector.c
@@ -0,0 +1,173 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <stdio.h>
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VSEV_FLOAT vsev_float32xm8
+#define VSSEV_FLOAT vssev_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VSEV_FLOAT vsev_float64xm8
+#define VSSEV_FLOAT vssev_float64xm8
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i = 0, j = 0;
+	BLASLONG ix = 0,iy = 0;
+        BLASLONG stride_x, stride_y;
+        FLOAT_V_T vx0, vx1, vy0, vy1;
+        unsigned int gvl = 0;
+
+	if (n < 0)  return(0);
+        if(inc_x == 1 && inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                if(gvl <= n/2){
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+                                vx0 = VLEV_FLOAT(&x[j], gvl);
+                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                VSEV_FLOAT(&x[j], vy0, gvl);
+                                VSEV_FLOAT(&y[j], vx0, gvl);
+
+                                vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                VSEV_FLOAT(&x[j+gvl], vy1, gvl);
+                                VSEV_FLOAT(&y[j+gvl], vx1, gvl);
+                                j+=gvl * 2;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx0 = VLEV_FLOAT(&x[j], gvl);
+                        vy0 = VLEV_FLOAT(&y[j], gvl);
+                        VSEV_FLOAT(&x[j], vy0, gvl);
+                        VSEV_FLOAT(&y[j], vx0, gvl);
+                        j+=gvl;
+                }
+        }else if (inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                stride_x = inc_x * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        BLASLONG inc_xv = inc_x * gvl;
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl);
+                                VSEV_FLOAT(&y[j], vx0, gvl);
+
+                                vx1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vy1, gvl);
+                                VSEV_FLOAT(&y[j+gvl], vx1, gvl);
+                                j += gvl * 2;
+                                ix += inc_xv * 2;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vy0 = VLEV_FLOAT(&y[j], gvl);
+                        VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl);
+                        VSEV_FLOAT(&y[j], vx0, gvl);
+                        j += gvl;
+                        ix += inc_x * gvl;
+                }
+        }else if(inc_x == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                stride_y = inc_y * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        BLASLONG inc_yv = inc_y * gvl;
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+                                vx0 = VLEV_FLOAT(&x[j], gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                VSEV_FLOAT(&x[j], vy0, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+
+                                vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
+                                VSEV_FLOAT(&x[j+gvl], vy1, gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vx1, gvl);
+                                j += gvl * 2;
+                                iy += inc_yv * 2;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx0 = VLEV_FLOAT(&x[j], gvl);
+                        vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                        VSEV_FLOAT(&x[j], vy0, gvl);
+                        VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+                        j += gvl;
+                        iy += inc_y * gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        BLASLONG inc_xv = inc_x * gvl;
+                        BLASLONG inc_yv = inc_y * gvl;
+                        for(i=0,j=0; i<n/(2*gvl); i++){
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+
+                                vx1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
+                                VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vy1, gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vx1, gvl);
+                                j += gvl * 2;
+                                ix += inc_xv * 2;
+                                iy += inc_yv * 2;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                        VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl);
+                        VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+                        j += gvl;
+                        ix += inc_x * gvl;
+                        iy += inc_y * gvl;
+                }
+        }
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/symv_L.c b/kernel/riscv64/symv_L.c
new file mode 100644
index 000000000..8f48d03f5
--- /dev/null
+++ b/kernel/riscv64/symv_L.c
@@ -0,0 +1,70 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG jx,jy;
+	BLASLONG j;
+	FLOAT temp1;
+	FLOAT temp2;
+
+#if 0
+	if ( m != offset )
+		printf("Symv_L: m=%d offset=%d\n",m,offset);
+#endif
+
+	jx = 0;
+	jy = 0;
+
+	for (j=0; j<offset; j++)
+	{
+		temp1 = alpha * x[jx];
+		temp2 = 0.0;
+		y[jy] += temp1 * a[j*lda+j];
+		iy = jy;
+		ix = jx;
+		for (i=j+1; i<m; i++)
+		{
+			ix += inc_x;
+			iy += inc_y;
+			y[iy] += temp1 * a[j*lda+i];
+			temp2 += a[j*lda+i] * x[ix];
+			
+		}
+		y[jy] += alpha * temp2;
+		jx    += inc_x;
+		jy    += inc_y;
+	}
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c
new file mode 100644
index 000000000..3c2647026
--- /dev/null
+++ b/kernel/riscv64/symv_L_vector.c
@@ -0,0 +1,265 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFMULVV_FLOAT vfmulvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFMULVV_FLOAT vfmulvv_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1;
+        FLOAT temp2;
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+
+        FLOAT_V_T va, vx, vy, vr;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv, len;
+
+        if(inc_x == 1 && inc_y == 1){
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[j];
+                        temp2 = 0.0;
+                        y[j] += temp1 * a_ptr[j];
+                        i = j + 1;
+                        len = m - i;
+                        if(len > 0){
+                                gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                for(k = 0; k < len / gvl; k++){
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < m){
+                                        gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+			}
+                        y[j] += alpha * temp2;
+                        a_ptr += lda;
+                }
+        }else if(inc_x == 1){
+                jy = 0;
+                stride_y = inc_y * sizeof(FLOAT);
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[j];
+                        temp2 = 0.0;
+                        y[jy] += temp1 * a_ptr[j];
+                        iy = jy + inc_y;
+                        i = j + 1;
+                        len = m - i;
+                        if(len > 0){
+                                gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                                inc_yv = inc_y * gvl;
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                for(k = 0; k < len / gvl; k++){
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                        iy += inc_yv;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < m){
+                                        gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+			}
+                        y[jy] += alpha * temp2;
+                        jy    += inc_y;
+                        a_ptr += lda;
+                }
+        }else if(inc_y == 1){
+                jx = 0;
+                stride_x = inc_x * sizeof(FLOAT);
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        temp2 = 0.0;
+                        y[j] += temp1 * a_ptr[j];
+                        ix = jx + inc_x;
+                        i = j + 1;
+                        len = m - i;
+                        if(len > 0){
+                                gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                inc_xv = inc_x * gvl;
+                                for(k = 0; k < len / gvl; k++){
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                        ix += inc_xv;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < m){
+                                        gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+			}
+                        y[j] += alpha * temp2;
+                        jx    += inc_x;
+                        a_ptr += lda;
+                }
+        }else{
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                jx = 0;
+                jy = 0;
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        temp2 = 0.0;
+                        y[jy] += temp1 * a_ptr[j];
+                        ix = jx + inc_x;
+                        iy = jy + inc_y;
+                        i = j + 1;
+                        len = m - i;
+                        if(len > 0){
+                                gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                                inc_xv = inc_x * gvl;
+                                inc_yv = inc_y * gvl;
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                for(k = 0; k < len / gvl; k++){
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                        ix += inc_xv;
+                                        iy += inc_yv;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < m){
+                                        gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+			}
+                        y[jy] += alpha * temp2;
+                        jx    += inc_x;
+                        jy    += inc_y;
+                        a_ptr += lda;
+                }
+        }
+        return(0);
+}
+
diff --git a/kernel/riscv64/symv_U.c b/kernel/riscv64/symv_U.c
new file mode 100644
index 000000000..b5a0c96e9
--- /dev/null
+++ b/kernel/riscv64/symv_U.c
@@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG jx,jy;
+	BLASLONG j;
+	FLOAT temp1;
+	FLOAT temp2;
+
+#if 0
+	if( m != offset )
+		printf("Symv_U: m=%d offset=%d\n",m,offset);
+#endif
+
+	BLASLONG m1 = m - offset;
+
+	jx = m1 * inc_x;
+	jy = m1 * inc_y;
+
+	for (j=m1; j<m; j++)
+	{
+		temp1 = alpha * x[jx];
+		temp2 = 0.0;
+		iy = 0;
+		ix = 0;
+		for (i=0; i<j; i++)
+		{
+			y[iy] += temp1 * a[j*lda+i];
+			temp2 += a[j*lda+i] * x[ix];
+			ix += inc_x;
+			iy += inc_y;
+			
+		}
+		y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
+		jx    += inc_x;
+		jy    += inc_y;
+	}
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c
new file mode 100644
index 000000000..29e0e4b65
--- /dev/null
+++ b/kernel/riscv64/symv_U_vector.c
@@ -0,0 +1,264 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFDOTVV_FLOAT vfdotvv_float32xm4
+#define VFMULVV_FLOAT vfmulvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFDOTVV_FLOAT vfdotvv_float64xm4
+#define VFMULVV_FLOAT vfmulvv_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1;
+        FLOAT temp2;
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+
+        FLOAT_V_T va, vx, vy, vr;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+
+        BLASLONG m1 = m - offset;
+        if(inc_x == 1 && inc_y == 1){
+                a_ptr += m1 * lda;
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[j];
+                        temp2 = 0.0;
+                        if(j > 0){
+                                i = 0;
+                                gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                for(k = 0; k < j / gvl; k++){
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < j){
+                                        gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+                        }
+                        y[j] += temp1 * a_ptr[j] + alpha * temp2;
+                        a_ptr += lda;
+                }
+        }else if(inc_x == 1){
+                jy = m1 * inc_y;
+                a_ptr += m1 * lda;
+                stride_y = inc_y * sizeof(FLOAT);
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[j];
+                        temp2 = 0.0;
+                        if(j > 0){
+                                iy = 0;
+                                i = 0;
+                                gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                                inc_yv = inc_y * gvl;
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                for(k = 0; k < j / gvl; k++){
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                        iy += inc_yv;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < j){
+                                        gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLEV_FLOAT(&x[i], gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+                        }
+                        y[jy] += temp1 * a_ptr[j] + alpha * temp2;
+                        a_ptr += lda;
+                        jy    += inc_y;
+                }
+        }else if(inc_y == 1){
+                jx = m1 * inc_x;
+                a_ptr += m1 * lda;
+                stride_x = inc_x * sizeof(FLOAT);
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        temp2 = 0.0;
+                        if(j > 0){
+                                ix = 0;
+                                i = 0;
+                                gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                                inc_xv = inc_x * gvl;
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                for(k = 0; k < j / gvl; k++){
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                        ix += inc_xv;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < j){
+                                        gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLEV_FLOAT(&y[i], gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSEV_FLOAT(&y[i], vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+                        }
+                        y[j] += temp1 * a_ptr[j] + alpha * temp2;
+                        a_ptr += lda;
+                        jx    += inc_x;
+                }
+        }else{
+                jx = m1 * inc_x;
+                jy = m1 * inc_y;
+                a_ptr += m1 * lda;
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        temp2 = 0.0;
+                        if(j > 0){
+                                ix = 0;
+                                iy = 0;
+                                i = 0;
+                                gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                                inc_xv = inc_x * gvl;
+                                inc_yv = inc_y * gvl;
+                                vr = VFMVVF_FLOAT(0, gvl);
+                                for(k = 0; k < j / gvl; k++){
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, vx, va, gvl);
+
+                                        i += gvl;
+                                        ix += inc_xv;
+                                        iy += inc_yv;
+                                }
+                                va = VFMVVF_FLOAT(0, gvl);
+                                va = VFREDSUM_FLOAT(vr, va, gvl);
+                                temp2 = va[0];
+                                if(i < j){
+                                        gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                        vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                        va = VLEV_FLOAT(&a_ptr[i], gvl);
+                                        vy = VFMACCVF_FLOAT(vy, temp1, va, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy, gvl);
+
+                                        vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                        vr = VFMULVV_FLOAT(vx, va, gvl);
+                                        va = VFMVVF_FLOAT(0, gvl);
+                                        va = VFREDSUM_FLOAT(vr, va, gvl);
+                                        temp2 += va[0];
+                                }
+                        }
+                        y[jy] += temp1 * a_ptr[j] + alpha * temp2;
+                        a_ptr += lda;
+                        jx    += inc_x;
+                        jy    += inc_y;
+                }
+        }
+        return(0);
+}
+
diff --git a/kernel/riscv64/zamax.c b/kernel/riscv64/zamax.c
new file mode 100644
index 000000000..a39bd7821
--- /dev/null
+++ b/kernel/riscv64/zamax.c
@@ -0,0 +1,79 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf;
+	BLASLONG inc_x2;
+
+	if (n <= 0 || inc_x <= 0) return(0.0);
+
+	inc_x2 = 2 * inc_x;
+
+	maxf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) > maxf )
+		{
+			maxf = CABS1(x,ix);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(maxf);
+}
+
+
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
new file mode 100644
index 000000000..a6c742b14
--- /dev/null
+++ b/kernel/riscv64/zamax_vector.c
@@ -0,0 +1,104 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMAXVV_FLOAT vfmaxvv_float32xm8
+#define VFADDVV_FLOAT vfaddvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMAXVV_FLOAT vfmaxvv_float64xm8
+#define VFADDVV_FLOAT vfaddvv_float64xm8
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+	if (n <= 0 || inc_x <= 0) return(maxf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_max;
+
+        MASK_T mask0, mask1;
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_max = VFMVVF_FLOAT(0, gvl);
+        BLASLONG inc_xv = inc_x * gvl * 2;
+        for(; i<n/gvl; i++){
+                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+
+                v0 = VFADDVV_FLOAT(v0, v1, gvl);
+                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
+
+                j += gvl;
+                ix += inc_xv;
+        }
+        v0 = VFMVVF_FLOAT(0, gvl);
+        v_max = VFREDMAXVS_FLOAT(v_max, v0, gvl);
+        maxf = v_max[0];
+
+        if(j<n){
+                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+                v1 = VFADDVV_FLOAT(v0, v1, gvl);
+                v0 = VFMVVF_FLOAT(0, gvl);
+                v_max = VFREDMAXVS_FLOAT(v1, v0, gvl);
+                if(v_max[0] > maxf)
+                        maxf = v_max[0];
+        }
+        return(maxf);
+}
diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c
new file mode 100644
index 000000000..02eab3e75
--- /dev/null
+++ b/kernel/riscv64/zamin.c
@@ -0,0 +1,79 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf;
+	BLASLONG inc_x2;
+
+	if (n <= 0 || inc_x <= 0) return(0.0);
+
+	inc_x2 = 2 * inc_x;
+
+	minf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
+
+	while(i < n)
+	{
+		if( CABS1(x,ix) < minf )
+		{
+			minf = CABS1(x,ix);
+		}
+		ix += inc_x2;
+		i++;
+	}
+	return(minf);
+}
+
+
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
new file mode 100644
index 000000000..44a7cf1dc
--- /dev/null
+++ b/kernel/riscv64/zamin_vector.c
@@ -0,0 +1,104 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDMINVS_FLOAT vfredminvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFMINVV_FLOAT vfminvv_float32xm8
+#define VFADDVV_FLOAT vfaddvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDMINVS_FLOAT vfredminvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFMINVV_FLOAT vfminvv_float64xm8
+#define VFADDVV_FLOAT vfaddvv_float64xm8
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	if (n <= 0 || inc_x <= 0) return(0.0);
+	FLOAT minf=FLT_MAX;
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_min;
+        MASK_T mask0, mask1;
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+        BLASLONG inc_xv = inc_x * gvl * 2;
+        for(; i<n/gvl; i++){
+                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+
+                v0 = VFADDVV_FLOAT(v0, v1, gvl);
+                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
+
+                j += gvl;
+                ix += inc_xv;
+        }
+        v0 = VFMVVF_FLOAT(FLT_MAX, gvl);
+        v_min = VFREDMINVS_FLOAT(v_min, v0, gvl);
+        minf = v_min[0];
+
+        if(j<n){
+                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+                v1 = VFADDVV_FLOAT(v0, v1, gvl);
+                v0 = VFMVVF_FLOAT(FLT_MAX, gvl);
+                v_min = VFREDMINVS_FLOAT(v1, v0, gvl);
+                if(v_min[0] < minf)
+                        minf = v_min[0];
+        }
+        return(minf);
+}
diff --git a/kernel/riscv64/zasum.c b/kernel/riscv64/zasum.c
new file mode 100644
index 000000000..61e85cae6
--- /dev/null
+++ b/kernel/riscv64/zasum.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	FLOAT sumf = 0.0;
+	BLASLONG inc_x2;
+
+	if (n <= 0 || inc_x <= 0) return(sumf);
+
+	inc_x2 = 2 * inc_x;
+
+	n *= inc_x2;
+	while(i < n)
+	{
+		sumf += CABS1(x,i);
+		i += inc_x2;
+	}
+	return(sumf);
+}
+
+
diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c
new file mode 100644
index 000000000..d9fa88971
--- /dev/null
+++ b/kernel/riscv64/zasum_vector.c
@@ -0,0 +1,136 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8
+#define MASK_T e32xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8
+#define VFMVVF_FLOAT vfmvvf_float32xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8
+#define VFADDVV_FLOAT vfaddvv_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8
+#define MASK_T e64xm8_t
+#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8
+#define VFMVVF_FLOAT vfmvvf_float64xm8
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8
+#define VFADDVV_FLOAT vfaddvv_float64xm8
+#endif
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	FLOAT asumf=0.0;
+	if (n <= 0 || inc_x <= 0) return(asumf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_zero,v_sum;
+
+        MASK_T mask0, mask1;
+        if(inc_x == 1){
+                BLASLONG n2 = n * 2;
+                gvl = vsetvli(n2, RVV_EFLOAT, RVV_M);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                if(gvl <= n2/2){
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n2/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                                v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                                v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl);
+                        asumf += v0[0];
+                }
+                for(;j<n2;){
+                        gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                        v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl);
+                        asumf += v0[0];
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
+                v_zero = VFMVVF_FLOAT(0, gvl);
+
+                BLASLONG inc_xv = inc_x * 2 * gvl;
+                v_sum = VFMVVF_FLOAT(0, gvl);
+                for(i=0,j=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                        v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                        v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+                        v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                }
+                v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl);
+                asumf += v0[0];
+                if(j<n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+
+                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
+                        v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl);
+                        v_sum = VFADDVV_FLOAT(v0, v1, gvl);
+                        v_sum = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl);
+                        asumf += v_sum[0];
+                }
+        }
+	return(asumf);
+}
+
+
diff --git a/kernel/riscv64/zaxpby.c b/kernel/riscv64/zaxpby.c
new file mode 100644
index 000000000..445354416
--- /dev/null
+++ b/kernel/riscv64/zaxpby.c
@@ -0,0 +1,118 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***************************************************************************
+* 2014/06/07 Saar
+*
+***************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix,iy;
+	FLOAT temp;
+	BLASLONG inc_x2, inc_y2;
+
+	if ( n <= 0     )  return(0);
+
+	ix = 0;
+	iy = 0;
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	if ( beta_r == 0.0 && beta_i == 0.0)
+	{
+		if ( alpha_r == 0.0 && alpha_i == 0.0 )
+		{
+
+			while(i < n)
+			{
+				y[iy]   = 0.0 ;
+				y[iy+1] = 0.0 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+		}
+		else
+		{
+
+			while(i < n)
+			{
+				y[iy]   = ( alpha_r * x[ix]   - alpha_i * x[ix+1] ) ;
+				y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix]   ) ;
+				ix += inc_x2 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+
+		}
+
+	}
+	else
+	{
+		if ( alpha_r == 0.0 && alpha_i == 0.0 )
+		{
+
+			while(i < n)
+			{
+				temp    = ( beta_r * y[iy]   - beta_i * y[iy+1] ) ;
+				y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy]   ) ;
+				y[iy]   = temp;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+		}
+		else
+		{
+
+			while(i < n)
+			{
+				temp    = ( alpha_r * x[ix]   - alpha_i * x[ix+1] ) + ( beta_r * y[iy]   - beta_i * y[iy+1] ) ;
+				y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix]   ) + ( beta_r * y[iy+1] + beta_i * y[iy]   ) ;
+				y[iy]   = temp;
+				ix += inc_x2 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+
+		}
+
+
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
new file mode 100644
index 000000000..1897ce417
--- /dev/null
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -0,0 +1,197 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFMULVF_FLOAT vfmulvf_float32xm4
+#define VFMSACVF_FLOAT vfmsacvf_float32xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFMULVF_FLOAT vfmulvf_float64xm4
+#define VFMSACVF_FLOAT vfmsacvf_float64xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
+#endif
+
+int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)
+{
+	if (n <= 0)  return(0);
+
+	BLASLONG i=0, j=0;
+	unsigned int gvl = 0;
+	FLOAT_V_T vx0, vx1;
+        FLOAT_V_T vy0, vy1;
+
+	BLASLONG stride_x, stride_y, ix = 0, iy = 0;
+        stride_x = inc_x * 2 * sizeof(FLOAT);
+        stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        if(beta_r == 0.0 && beta_i == 0.0){
+                if(alpha_r == 0.0 && alpha_i == 0.0){
+                        if(inc_y == 1){
+                                memset(&y[0], 0, 2 * n * sizeof(FLOAT));
+                        }else{
+                                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                                if(gvl <= n/2){
+                                        vy0 = VFMVVF_FLOAT(0.0, gvl);
+                                        BLASLONG inc_yv = inc_y * gvl * 2;
+                                        for(i=0,j=0;i<n/(gvl*2);i++){
+                                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                                VSSEV_FLOAT(&y[iy+1], stride_y, vy0, gvl);
+                                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy0, gvl);
+                                                VSSEV_FLOAT(&y[iy+1+inc_yv], stride_y, vy0, gvl);
+                                                j += gvl * 2;
+                                                iy += inc_yv * 2;
+                                        }
+                                }
+                                for(;j<n;){
+                                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                        vy0 = VFMVVF_FLOAT(0.0, gvl);
+                                        VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                        VSSEV_FLOAT(&y[iy+1], stride_y, vy0, gvl);
+                                        j += gvl;
+                                        iy += inc_y * gvl * 2;
+                                }
+                        }
+		}else{
+                        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                        BLASLONG inc_xv = inc_x * gvl * 2;
+                        BLASLONG inc_yv = inc_y * gvl * 2;
+                        for(i=0,j=0; i<n/gvl; i++){
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                                vy0 = VFMULVF_FLOAT(vx1, alpha_i, gvl);
+                                vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                vy1 = VFMULVF_FLOAT(vx1, alpha_r, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                j += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        if(j<n){
+                                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                                vy0 = VFMULVF_FLOAT(vx1, alpha_i, gvl);
+                                vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                vy1 = VFMULVF_FLOAT(vx1, alpha_r, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+                        }
+                }
+        }else{
+	        FLOAT_V_T v0, v1;
+                if(alpha_r == 0.0 && alpha_i == 0.0){
+                        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                        BLASLONG inc_yv = inc_y * gvl * 2;
+                        for(i=0,j=0;i<n/gvl;i++){
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                                v0 = VFMULVF_FLOAT(vy1, beta_i, gvl);
+                                v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
+                                v1 = VFMULVF_FLOAT(vy1, beta_r, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, v1, gvl);
+                                j += gvl;
+                                iy += inc_yv;
+                        }
+                        if(j<n){
+                                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                                v0 = VFMULVF_FLOAT(vy1, beta_i, gvl);
+                                v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
+                                v1 = VFMULVF_FLOAT(vy1, beta_r, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, v1, gvl);
+                        }
+		}else{
+                        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                        BLASLONG inc_xv = inc_x * gvl * 2;
+                        BLASLONG inc_yv = inc_y * gvl * 2;
+                        for(i=0,j=0; i<n/gvl; i++){
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                                v0 = VFMULVF_FLOAT(vx0, alpha_r, gvl);
+                                v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, gvl);
+                                v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, gvl);
+                                v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
+                                v1 = VFMULVF_FLOAT(vx1, alpha_r, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, v1, gvl);
+
+                                j += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        if(j<n){
+                                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                                v0 = VFMULVF_FLOAT(vx0, alpha_r, gvl);
+                                v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, gvl);
+                                v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, gvl);
+                                v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
+                                v1 = VFMULVF_FLOAT(vx1, alpha_r, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, gvl);
+                                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, v1, gvl);
+                        }
+                }
+        }
+	return(0);
+}
+
diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c
new file mode 100644
index 000000000..1dcaeac27
--- /dev/null
+++ b/kernel/riscv64/zaxpy.c
@@ -0,0 +1,74 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/15 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix,iy;
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	if ( n < 0     )  return(0);
+	if ( da_r == 0.0 && da_i == 0.0 ) return(0);
+
+	ix = 0;
+	iy = 0;
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	while(i < n)
+	{
+#if !defined(CONJ)
+		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
+		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
+#else
+		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
+		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
+#endif
+		ix += inc_x2 ;
+		iy += inc_y2 ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
new file mode 100644
index 000000000..fb2656a1d
--- /dev/null
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -0,0 +1,107 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i = 0, j = 0;
+	BLASLONG ix = 0,iy = 0;
+	if(n < 0) return(0);
+	if(da_r == 0.0 && da_i == 0.0) return(0);
+        unsigned int gvl = 0;
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        FLOAT_V_T vx0, vx1, vy0, vy1;
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        BLASLONG inc_xv = inc_x * 2 * gvl;
+        BLASLONG inc_yv = inc_y * 2 * gvl;
+        for(i=0,j=0; i < n/gvl; i++){
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#if !defined(CONJ)
+                vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, gvl);
+                vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, gvl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, gvl);
+                vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, gvl);
+#endif
+                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+                j += gvl;
+                ix += inc_xv;
+                iy += inc_yv;
+        }
+        if(j < n){
+                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#if !defined(CONJ)
+                vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, gvl);
+                vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, gvl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, gvl);
+                vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, gvl);
+#endif
+                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+        }
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c
new file mode 100644
index 000000000..07fe584c5
--- /dev/null
+++ b/kernel/riscv64/zcopy.c
@@ -0,0 +1,65 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	if ( n < 0     )  return(0);
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	while(i < n)
+	{
+
+		y[iy]   = x[ix] ;
+		y[iy+1] = x[ix+1] ;
+		ix += inc_x2;
+		iy += inc_y2;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c
new file mode 100644
index 000000000..6ed430931
--- /dev/null
+++ b/kernel/riscv64/zcopy_vector.c
@@ -0,0 +1,92 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#endif
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i = 0, j = 0;
+	BLASLONG ix = 0,iy = 0;
+	if(n < 0) return(0);
+
+        unsigned int gvl = 0;
+        if(inc_x == 1 && inc_y == 1){
+                memcpy(&y[0], &x[0], n * 2 * sizeof(FLOAT));
+        }else{
+                FLOAT_V_T vx0, vx1, vx2, vx3;
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        BLASLONG inc_xv = inc_x * gvl * 2;
+                        BLASLONG inc_yv = inc_y * gvl * 2;
+                        for(i=0,j=0; i < n/(2*gvl); i++){
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vx1, gvl);
+
+                                vx2 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                vx3 = VLSEV_FLOAT(&x[ix+1+inc_xv], stride_x, gvl);
+                                VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vx2, gvl);
+                                VSSEV_FLOAT(&y[iy+1+inc_yv], stride_y, vx3, gvl);
+
+                                j += gvl * 2;
+                                ix += inc_xv * 2;
+                                iy += inc_yv * 2;
+                        }
+                }
+                for(;j<n;){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+                        VSSEV_FLOAT(&y[iy+1], stride_y, vx1, gvl);
+
+                        j += gvl;
+                        ix += inc_x * 2 * gvl;
+                        iy += inc_y * 2 * gvl;
+                }
+        }
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/zdot.c b/kernel/riscv64/zdot.c
new file mode 100644
index 000000000..733c235c6
--- /dev/null
+++ b/kernel/riscv64/zdot.c
@@ -0,0 +1,80 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: FAIL
+* 	 BLASTEST double	: FAIL
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT dot[2];
+	OPENBLAS_COMPLEX_FLOAT result;
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	dot[0]=0.0;
+	dot[1]=0.0;
+
+	CREAL(result) = 0.0 ;
+	CIMAG(result) = 0.0 ;
+
+	if ( n < 1 )  return(result);
+
+	inc_x2 = 2 * inc_x ;
+	inc_y2 = 2 * inc_y ;
+
+	while(i < n)
+	{
+#if !defined(CONJ)
+		dot[0] += ( x[ix]   * y[iy] - x[ix+1] * y[iy+1] ) ;
+		dot[1] += ( x[ix+1] * y[iy] + x[ix]   * y[iy+1] ) ;
+#else
+		dot[0] += ( x[ix]   * y[iy] + x[ix+1] * y[iy+1] ) ;
+		dot[1] -= ( x[ix+1] * y[iy] - x[ix]   * y[iy+1] ) ;
+#endif
+		ix  += inc_x2 ;
+		iy  += inc_y2 ;
+		i++ ;
+
+	}
+	CREAL(result) = dot[0];
+	CIMAG(result) = dot[1];
+	return(result);
+
+}
+
+
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
new file mode 100644
index 000000000..33efd07e7
--- /dev/null
+++ b/kernel/riscv64/zdot_vector.c
@@ -0,0 +1,135 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFDOTVV_FLOAT vfdotvv_float32xm4
+#define VFMULVV_FLOAT vfmulvv_float32xm4
+#define VFMSACVV_FLOAT vfmsacvv_float32xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFDOTVV_FLOAT vfdotvv_float64xm4
+#define VFMULVV_FLOAT vfmulvv_float64xm4
+#define VFMSACVV_FLOAT vfmsacvv_float64xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4
+#endif
+
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+        BLASLONG i=0, j=0;
+        BLASLONG ix=0,iy=0;
+        FLOAT dot[2];
+        OPENBLAS_COMPLEX_FLOAT result;
+
+        dot[0]=0.0;
+        dot[1]=0.0;
+
+        CREAL(result) = 0.0;
+        CIMAG(result) = 0.0;
+
+        if ( n < 1 )  return(result);
+
+        unsigned int gvl = 0;
+
+        FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1;
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        vr0 = VFMVVF_FLOAT(0, gvl);
+        vr1 = VFMVVF_FLOAT(0, gvl);
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+        BLASLONG inc_xv = inc_x * 2 * gvl;
+        BLASLONG inc_yv = inc_y * 2 * gvl;
+
+        for(i=0,j=0; i<n/gvl; i++){
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+
+                vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, gvl);
+                vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, gvl);
+#if !defined(CONJ)
+                vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, gvl);
+                vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, gvl);
+#else
+                vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, gvl);
+                vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, gvl);
+#endif
+                j += gvl;
+                ix += inc_xv;
+                iy += inc_yv;
+        }
+        vx0 = VFMVVF_FLOAT(0, gvl);
+        vr0 = VFREDSUM_FLOAT(vr0, vx0, gvl);
+        dot[0] += vr0[0];
+        vr1 = VFREDSUM_FLOAT(vr1, vx0, gvl);
+        dot[1] += vr1[0];
+        //tail
+        if(j < n){
+                gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+
+#if !defined(CONJ)
+                vr0 = VFMULVV_FLOAT(vx1, vy1, gvl);
+                vr0 = VFMSACVV_FLOAT(vr0, vx0, vy0, gvl);
+                vr1 = VFMULVV_FLOAT(vx0, vy1, gvl);
+                vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, gvl);
+#else
+                vr0 = VFMULVV_FLOAT(vx0, vy0, gvl);
+                vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, gvl);
+                vr1 = VFMULVV_FLOAT(vx1, vy0, gvl);
+                vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl);
+#endif
+                vx0 = VFMVVF_FLOAT(0, gvl);
+                vr0 = VFREDSUM_FLOAT(vr0, vx0, gvl);
+                dot[0] += vr0[0];
+                vr1 = VFREDSUM_FLOAT(vr1, vx0, gvl);
+                dot[1] += vr1[0];
+        }
+        CREAL(result) = dot[0];
+        CIMAG(result) = dot[1];
+        return(result);
+}
diff --git a/kernel/riscv64/zgemv_n.c b/kernel/riscv64/zgemv_n.c
new file mode 100644
index 000000000..b9b03f792
--- /dev/null
+++ b/kernel/riscv64/zgemv_n.c
@@ -0,0 +1,157 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+ * * 2013/11/23 Saar
+ * *	 BLASTEST float		: OK
+ * * 	 BLASTEST double	: OK
+ * 	 CTEST			: OK
+ * 	 TEST			: OK
+ * *
+ * **************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT temp_r,temp_i;
+	BLASLONG inc_x2,inc_y2;
+	BLASLONG lda2;
+	BLASLONG i2;
+
+	lda2 = 2*lda;
+
+	ix = 0;
+	a_ptr = a;
+
+	if ( inc_x == 1 && inc_y == 1 )
+	{
+
+	   for (j=0; j<n; j++)
+	   {
+
+#if !defined(XCONJ)
+		temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+		temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else
+		temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+		temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+		iy = 0;
+		i2=0;
+
+		for (i=0; i<m; i++)
+		{
+#if !defined(CONJ)
+
+#if !defined(XCONJ)
+			y[iy]   += temp_r * a_ptr[i2]   - temp_i * a_ptr[i2+1];
+			y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+#else
+			y[iy]   += temp_r * a_ptr[i2]   + temp_i * a_ptr[i2+1];
+			y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+#endif
+
+#else
+
+#if !defined(XCONJ)
+			y[iy]   += temp_r * a_ptr[i2]   + temp_i * a_ptr[i2+1];
+			y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+#else
+			y[iy]   += temp_r * a_ptr[i2]   - temp_i * a_ptr[i2+1];
+			y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+#endif
+
+#endif
+			i2 += 2;
+			iy += 2;
+		}
+		a_ptr += lda2;
+		ix    += 2;
+	   }
+
+	   return(0);
+
+	}
+
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	for (j=0; j<n; j++)
+	{
+
+#if !defined(XCONJ)
+		temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+		temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else
+		temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+		temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+		iy = 0;
+		i2=0;
+
+		for (i=0; i<m; i++)
+		{
+#if !defined(CONJ)
+
+#if !defined(XCONJ)
+			y[iy]   += temp_r * a_ptr[i2]   - temp_i * a_ptr[i2+1];
+			y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+#else
+			y[iy]   += temp_r * a_ptr[i2]   + temp_i * a_ptr[i2+1];
+			y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+#endif
+
+#else
+
+#if !defined(XCONJ)
+			y[iy]   += temp_r * a_ptr[i2]   + temp_i * a_ptr[i2+1];
+			y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+#else
+			y[iy]   += temp_r * a_ptr[i2]   - temp_i * a_ptr[i2+1];
+			y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+#endif
+
+#endif
+			i2 += 2;
+			iy += inc_y2;
+		}
+		a_ptr += lda2;
+		ix    += inc_x2;
+	}
+
+
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c
new file mode 100644
index 000000000..31cbbe6bb
--- /dev/null
+++ b/kernel/riscv64/zgemv_n_vector.c
@@ -0,0 +1,175 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i = 0, j = 0, k = 0;
+        BLASLONG ix = 0, iy = 0;
+        FLOAT *a_ptr = a;
+        FLOAT temp_r = 0.0, temp_i = 0.0;
+        FLOAT_V_T va0, va1, vy0, vy1;
+        unsigned int gvl = 0;
+        BLASLONG stride_a = sizeof(FLOAT) * 2;
+        BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;
+        gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+        BLASLONG inc_yv = inc_y * gvl * 2;
+        BLASLONG inc_x2 = inc_x * 2;
+        BLASLONG lda2 = lda * 2;
+        for(k=0,j=0; k<m/gvl; k++){
+                a_ptr = a;
+                ix = 0;
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                for(i = 0; i < n; i++){
+#if !defined(XCONJ)
+			temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+			temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else
+			temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+			temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+
+                        va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
+                        va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
+#if !defined(CONJ)
+#if !defined(XCONJ)
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+#else
+
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+#endif
+
+#else
+
+#if !defined(XCONJ)
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+#else
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+#endif
+
+#endif
+                        a_ptr += lda2;
+                        ix += inc_x2;
+                }
+                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+                j += gvl * 2;
+                iy += inc_yv;
+        }
+        //tail
+        if(j/2 < m){
+                gvl = vsetvli(m-j/2, RVV_EFLOAT, RVV_M);
+                a_ptr = a;
+                ix = 0;
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                for(i = 0; i < n; i++){
+#if !defined(XCONJ)
+			temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+			temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else
+			temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+			temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+
+                        va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
+                        va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
+#if !defined(CONJ)
+
+#if !defined(XCONJ)
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+#else
+
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+#endif
+
+#else
+
+#if !defined(XCONJ)
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+#else
+			vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+#endif
+
+#endif
+                        a_ptr += lda2;
+                        ix += inc_x2;
+                }
+                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+        }
+	return(0);
+}
+
+
diff --git a/kernel/riscv64/zgemv_t.c b/kernel/riscv64/zgemv_t.c
new file mode 100644
index 000000000..1239cf3f7
--- /dev/null
+++ b/kernel/riscv64/zgemv_t.c
@@ -0,0 +1,140 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+ * * 2013/11/23 Saar
+ * *	 BLASTEST float		: OK
+ * * 	 BLASTEST double	: OK
+ * 	 CTEST			: OK
+ * 	 TEST			: OK
+ * *
+ * **************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT temp_r,temp_i;
+	BLASLONG inc_x2,inc_y2;
+	BLASLONG lda2;
+	BLASLONG i2;
+
+	lda2 = 2*lda;
+
+	iy = 0;
+	a_ptr = a;
+
+	if ( inc_x == 1 && inc_y == 1 )
+	{
+
+	   for (j=0; j<n; j++)
+	   {
+		temp_r = 0.0;
+		temp_i = 0.0;
+		ix = 0;
+		i2=0;
+
+		for (i=0; i<m; i++)
+		{
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+			temp_r += a_ptr[i2] * x[ix]   - a_ptr[i2+1] * x[ix+1];
+			temp_i += a_ptr[i2] * x[ix+1] + a_ptr[i2+1] * x[ix];
+#else
+			temp_r += a_ptr[i2] * x[ix]   + a_ptr[i2+1] * x[ix+1];
+			temp_i += a_ptr[i2] * x[ix+1] - a_ptr[i2+1] * x[ix];
+#endif
+
+			i2 += 2;
+			ix += 2;
+		}
+
+#if !defined(XCONJ)
+		y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
+		y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+		y[iy]   += alpha_r * temp_r + alpha_i * temp_i;
+		y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+		a_ptr += lda2;
+		iy    += 2;
+	   }
+
+	   return(0);
+
+	}
+
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	for (j=0; j<n; j++)
+	{
+		temp_r = 0.0;
+		temp_i = 0.0;
+		ix = 0;
+		i2=0;
+
+		for (i=0; i<m; i++)
+		{
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+			temp_r += a_ptr[i2] * x[ix]   - a_ptr[i2+1] * x[ix+1];
+			temp_i += a_ptr[i2] * x[ix+1] + a_ptr[i2+1] * x[ix];
+#else
+			temp_r += a_ptr[i2] * x[ix]   + a_ptr[i2+1] * x[ix+1];
+			temp_i += a_ptr[i2] * x[ix+1] - a_ptr[i2+1] * x[ix];
+#endif
+
+			i2 += 2;
+			ix += inc_x2;
+		}
+
+#if !defined(XCONJ)
+		y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
+		y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+		y[iy]   += alpha_r * temp_r + alpha_i * temp_i;
+		y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+		a_ptr += lda2;
+		iy    += inc_y2;
+	}
+
+	return(0);
+
+}
+
+
+
diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c
new file mode 100644
index 000000000..b23a4d8a3
--- /dev/null
+++ b/kernel/riscv64/zgemv_t_vector.c
@@ -0,0 +1,134 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFMULVV_FLOAT vfmulvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFMULVV_FLOAT vfmulvv_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i = 0, j = 0, k = 0;
+	BLASLONG ix = 0, iy = 0;
+	FLOAT *a_ptr = a;
+        FLOAT temp_r, temp_i;
+
+        FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
+        unsigned int gvl = 0;
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+        BLASLONG stride_a = sizeof(FLOAT) * 2;
+        gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+        BLASLONG inc_xv = inc_x * gvl * 2;
+        BLASLONG inc_av = gvl * 2;
+        BLASLONG inc_y2 = inc_y * 2;
+        BLASLONG lda2 = lda * 2;
+        for(i = 0; i < n; i++){
+                gvl = vsetvli(m, RVV_EFLOAT, RVV_M);
+                j = 0;
+                ix = 0;
+                vr = VFMVVF_FLOAT(0, gvl);
+                vi = VFMVVF_FLOAT(0, gvl);
+                for(k = 0; k < m/gvl; k++){
+                        va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
+                        va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
+                        vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                        vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+                        vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
+                        vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
+                        vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
+#else
+                        vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+                        vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
+                        vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
+                        vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
+
+#endif
+                        j += inc_av;
+                        ix += inc_xv;
+                }
+                va0 = VFMVVF_FLOAT(0, gvl);
+                vx0 = VFREDSUM_FLOAT(vr, va0, gvl);
+                temp_r = vx0[0];
+                vx1 = VFREDSUM_FLOAT(vi, va0, gvl);
+                temp_i = vx1[0];
+                if(j/2 < m){
+                        gvl = vsetvli(m-j/2, RVV_EFLOAT, RVV_M);
+                        va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
+                        va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
+                        vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                        vr = VFMULVV_FLOAT(va0, vx0, gvl);
+                        vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
+                        vi = VFMULVV_FLOAT(va0, vx1, gvl);
+                        vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
+#else
+                        vr = VFMULVV_FLOAT(va0, vx0, gvl);
+                        vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
+                        vi = VFMULVV_FLOAT(va0, vx1, gvl);
+                        vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
+
+#endif
+                        va0 = VFMVVF_FLOAT(0, gvl);
+                        vx0 = VFREDSUM_FLOAT(vr, va0, gvl);
+                        temp_r += vx0[0];
+                        vx1 = VFREDSUM_FLOAT(vi, va0, gvl);
+                        temp_i += vx1[0];
+                }
+#if !defined(XCONJ)
+                y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
+                y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+                y[iy]   += alpha_r * temp_r + alpha_i * temp_i;
+                y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+                iy += inc_y2;
+                a_ptr += lda2;
+        }
+	return(0);
+}
+
diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c
new file mode 100644
index 000000000..aa9ac85d5
--- /dev/null
+++ b/kernel/riscv64/zhemv_LM_vector.c
@@ -0,0 +1,191 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFMULVV_FLOAT vfmulvv_float32xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFMULVV_FLOAT vfmulvv_float64xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+        BLASLONG i, j, k;
+        BLASLONG ix, iy, ia;
+        BLASLONG jx, jy, ja;
+        FLOAT temp_r1, temp_i1;
+        FLOAT temp_r2, temp_i2;
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+
+
+        FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+        BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2;
+
+        BLASLONG inc_x2 = incx * 2;
+        BLASLONG inc_y2 = incy * 2;
+        stride_x = inc_x2 * sizeof(FLOAT);
+        stride_y = inc_y2 * sizeof(FLOAT);
+        stride_a = 2 * sizeof(FLOAT);
+        lda2 = lda * 2;
+
+        jx = 0;
+        jy = 0;
+        ja = 0;
+        for(j = 0; j < offset; j++){
+                temp_r1 = alpha_r * x[jx]   - alpha_i * x[jx+1];;
+                temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+                temp_r2 = 0;
+                temp_i2 = 0;
+                y[jy] += temp_r1 * a_ptr[ja];
+                y[jy+1] += temp_i1 * a_ptr[ja];
+                ix = jx + inc_x2;
+                iy = jy + inc_y2;
+                ia = ja + 2;
+                i = j + 1;
+                len = m - i;
+                if(len > 0){
+                        gvl = vsetvli(len, RVV_EFLOAT, RVV_M);
+                        inc_xv = incx * gvl * 2;
+                        inc_yv = incy * gvl * 2;
+                        inc_av = gvl * 2;
+                        vr0 = VFMVVF_FLOAT(0, gvl);
+                        vr1 = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < len / gvl; k++){
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+
+#endif
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                                ia += inc_av;
+                        }
+                        va0 = VFMVVF_FLOAT(0, gvl);
+                        vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
+                        temp_r2 = vx0[0];
+                        vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
+                        temp_i2 = vx1[0];
+                        if(i < m){
+				gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+#endif
+
+                                va0 = VFMVVF_FLOAT(0, gvl);
+                                vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
+                                temp_r2 += vx0[0];
+                                vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
+                                temp_i2 += vx1[0];
+                        }
+                }
+		y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
+		y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+		jx    += inc_x2;
+		jy    += inc_y2;
+		ja    += 2;
+		a_ptr += lda2;
+        }
+	return(0);
+}
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
new file mode 100644
index 000000000..6fe12c76c
--- /dev/null
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFMULVV_FLOAT vfmulvv_float32xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFMULVV_FLOAT vfmulvv_float64xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
+#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+        BLASLONG i, j, k;
+        BLASLONG ix, iy, ia;
+        BLASLONG jx, jy, ja;
+        FLOAT temp_r1, temp_i1;
+        FLOAT temp_r2, temp_i2;
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+
+
+        FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+        BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2;
+
+        BLASLONG inc_x2 = incx * 2;
+        BLASLONG inc_y2 = incy * 2;
+        stride_x = inc_x2 * sizeof(FLOAT);
+        stride_y = inc_y2 * sizeof(FLOAT);
+        stride_a = 2 * sizeof(FLOAT);
+        lda2 = lda * 2;
+
+        BLASLONG m1 = m - offset;
+        a_ptr = a + m1 * lda2;
+        jx = m1 * inc_x2;
+        jy = m1 * inc_y2;
+        ja = m1 * 2;
+        for(j = m1; j < m; j++){
+                temp_r1 = alpha_r * x[jx]   - alpha_i * x[jx+1];;
+                temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+                temp_r2 = 0;
+                temp_i2 = 0;
+                ix = 0;
+                iy = 0;
+                ia = 0;
+                i = 0;
+                if(j > 0){
+                        gvl = vsetvli(j, RVV_EFLOAT, RVV_M);
+                        inc_xv = incx * gvl * 2;
+                        inc_yv = incy * gvl * 2;
+                        inc_av = gvl * 2;
+                        vr0 = VFMVVF_FLOAT(0, gvl);
+                        vr1 = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < j / gvl; k++){
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+
+#endif
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                                ia += inc_av;
+                        }
+                        va0 = VFMVVF_FLOAT(0, gvl);
+                        vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
+                        temp_r2 = vx0[0];
+                        vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
+                        temp_i2 = vx1[0];
+                        if(i < j){
+				gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+#endif
+
+                                va0 = VFMVVF_FLOAT(0, gvl);
+                                vx0 = VFREDSUM_FLOAT(vr0, va0, gvl);
+                                temp_r2 += vx0[0];
+                                vx1 = VFREDSUM_FLOAT(vr1, va0, gvl);
+                                temp_i2 += vx1[0];
+                        }
+                }
+                y[jy] += temp_r1 * a_ptr[ja];
+                y[jy+1] += temp_i1 * a_ptr[ja];
+		y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
+		y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+		jx    += inc_x2;
+		jy    += inc_y2;
+		ja    += 2;
+		a_ptr += lda2;
+        }
+	return(0);
+}
diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c
new file mode 100644
index 000000000..fc1c8b54a
--- /dev/null
+++ b/kernel/riscv64/znrm2.c
@@ -0,0 +1,106 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/13 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	FLOAT scale = 0.0;
+	FLOAT ssq   = 1.0;
+	BLASLONG inc_x2;
+	FLOAT temp;
+
+	if (n <= 0 || inc_x <= 0) return(0.0);
+
+	inc_x2 = 2 * inc_x;
+
+	n *= inc_x2;
+	while(i < n)
+	{
+
+		if ( x[i] != 0.0 )
+		{
+			temp = ABS( x[i] );
+			if ( scale < temp )
+			{
+				ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+				scale = temp ;
+			}
+			else
+			{
+				ssq += ( temp / scale ) * ( temp / scale );
+			}
+
+		}
+
+		if ( x[i+1] != 0.0 )
+		{
+			temp = ABS( x[i+1] );
+			if ( scale < temp )
+			{
+				ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+				scale = temp ;
+			}
+			else
+			{
+				ssq += ( temp / scale ) * ( temp / scale );
+			}
+
+		}
+
+
+		i += inc_x2;
+	}
+	scale = scale * sqrt( ssq );
+	return(scale);
+
+}
+
+
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
new file mode 100644
index 000000000..b0ebfa5f4
--- /dev/null
+++ b/kernel/riscv64/znrm2_vector.c
@@ -0,0 +1,278 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VFREDSUM_FLOAT vfredsumvs_float32xm4
+#define VFMACCVV_FLOAT vfmaccvv_float32xm4
+#define VFMVVF_FLOAT vfmvvf_float32xm4
+#define VFDOTVV_FLOAT vfdotvv_float32xm4
+#define ABS fabsf
+#define MASK_T e32xm4_t
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4
+#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4
+#define VMFIRSTM vmfirstm_e32xm4
+#define VFDIVVF_FLOAT vfdivvf_float32xm4
+#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4
+#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VFREDSUM_FLOAT vfredsumvs_float64xm4
+#define VFMACCVV_FLOAT vfmaccvv_float64xm4
+#define VFMVVF_FLOAT vfmvvf_float64xm4
+#define VFDOTVV_FLOAT vfdotvv_float64xm4
+#define ABS fabs
+#define MASK_T e64xm4_t
+#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4
+#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4
+#define VMFIRSTM vmfirstm_e64xm4
+#define VFDIVVF_FLOAT vfdivvf_float64xm4
+#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4
+#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+
+	if ( n < 0 )  return(0.0);
+//        if(n == 1) return (ABS(x[0]));
+
+        FLOAT_V_T vr, v0, v_zero;
+        unsigned int gvl = 0;
+        FLOAT scale = 0.0, ssq = 0.0;
+        MASK_T mask;
+        BLASLONG index = 0;
+        if(inc_x == 1){
+                BLASLONG n2 = n * 2;
+                gvl = vsetvli(n2, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                for(i=0,j=0; i<n2/gvl; i++){
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0){
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                }
+                        }else{//found greater element
+                                //ssq in vector vr: vr[0]
+                                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                                //total ssq before current vector
+                                ssq += vr[0];
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                //ssq in vector vr
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        }
+                        j += gvl;
+                }
+                //ssq in vector vr: vr[0]
+                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                //total ssq now
+                ssq += vr[0];
+
+                //tail
+                if(j < n2){
+                        gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0)
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        }else{//found greater element
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        }
+                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        //ssq in vector vr: vr[0]
+                        vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                        //total ssq now
+                        ssq += vr[0];
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                vr = VFMVVF_FLOAT(0, gvl);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
+                int idx = 0, inc_v = inc_x * gvl * 2;
+                for(i=0,j=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0){
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                }
+                        }else{//found greater element
+                                //ssq in vector vr: vr[0]
+                                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                                //total ssq before current vector
+                                ssq += vr[0];
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                //ssq in vector vr
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        }
+
+                        v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0){
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                }
+                        }else{//found greater element
+                                //ssq in vector vr: vr[0]
+                                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                                //total ssq before current vector
+                                ssq += vr[0];
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                //ssq in vector vr
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        }
+                        j += gvl;
+                        idx += inc_v;
+                }
+                //ssq in vector vr: vr[0]
+                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                //total ssq now
+                ssq += vr[0];
+
+                //tail
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0){
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                                }
+                        }else{//found greater element
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        }
+
+                        v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                        //fabs(vector)
+                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
+                        v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl);
+                        //if scale change
+                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                        index = VMFIRSTM(mask, gvl);
+                        if(index == -1){//no elements greater than scale
+                                if(scale != 0.0){
+                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                                }
+                        }else{//found greater element
+                                //ssq in vector vr: vr[0]
+                                vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                                //total ssq before current vector
+                                ssq += vr[0];
+                                //find max
+                                vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl);
+                                //update ssq before max_index
+                                ssq = ssq * (scale/vr[0])*(scale/vr[0]);
+                                //update scale
+                                scale = vr[0];
+                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                        }
+                        //ssq in vector vr: vr[0]
+                        vr = VFREDSUM_FLOAT(vr, v_zero, gvl);
+                        //total ssq now
+                        ssq += vr[0];
+                }
+        }
+	return(scale * sqrt(ssq));
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_cn.c b/kernel/riscv64/zomatcopy_cn.c
new file mode 100644
index 000000000..f5a7a6284
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_cn.c
@@ -0,0 +1,70 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+	bptr = b;
+
+	lda *= 2;
+	ldb *= 2;
+
+	for ( i=0; i<cols ; i++ )
+	{
+		ia = 0;
+
+		for(j=0; j<rows; j++)
+		{
+			bptr[ia]   = alpha_r * aptr[ia]   - alpha_i * aptr[ia+1];
+			bptr[ia+1] = alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+			ia+=2;
+		}
+		aptr += lda;
+		bptr += ldb;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_cnc.c b/kernel/riscv64/zomatcopy_cnc.c
new file mode 100644
index 000000000..210c3f716
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_cnc.c
@@ -0,0 +1,69 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * No Trans, conjugate
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+	bptr = b;
+	lda *= 2;
+	ldb *= 2;
+
+	for ( i=0; i<cols ; i++ )
+	{
+		ia = 0;
+
+		for(j=0; j<rows; j++)
+		{
+			bptr[ia]   =   alpha_r * aptr[ia]   + alpha_i * aptr[ia+1];
+			bptr[ia+1] = - alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+			ia += 2;
+		}
+		aptr += lda;
+		bptr += ldb;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_ct.c b/kernel/riscv64/zomatcopy_ct.c
new file mode 100644
index 000000000..38bc9b9f7
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_ct.c
@@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia,ib;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+
+	lda *= 2;
+	ldb *= 2;
+	ib = 0;
+	for ( i=0; i<cols ; i++ )
+	{
+		bptr = &b[ib];
+		ia = 0;
+
+		for(j=0; j<rows; j++)
+		{
+			bptr[0]   = alpha_r * aptr[ia]   - alpha_i * aptr[ia+1];
+			bptr[1]   = alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+			ia += 2;
+			bptr += ldb;
+		}
+		aptr += lda;
+		ib += 2;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_ctc.c b/kernel/riscv64/zomatcopy_ctc.c
new file mode 100644
index 000000000..34e7e919a
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_ctc.c
@@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * Trans, conjugate
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia,ib;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+
+	lda *= 2;
+	ldb *= 2;
+	ib = 0;
+	for ( i=0; i<cols ; i++ )
+	{
+		bptr = &b[ib];
+		ia = 0;
+
+		for(j=0; j<rows; j++)
+		{
+			bptr[0]   =   alpha_r * aptr[ia]   + alpha_i * aptr[ia+1];
+			bptr[1]   = - alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+			ia += 2;
+			bptr += ldb;
+		}
+		aptr += lda;
+		ib += 2;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_rn.c b/kernel/riscv64/zomatcopy_rn.c
new file mode 100644
index 000000000..ded381e15
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_rn.c
@@ -0,0 +1,70 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+	bptr = b;
+
+	lda *=2;
+	ldb *=2;
+
+	for ( i=0; i<rows ; i++ )
+	{
+		ia = 0;
+
+		for(j=0; j<cols; j++)
+		{
+			bptr[ia]   = alpha_r * aptr[ia]    - alpha_i * aptr[ia+1];
+			bptr[ia+1] = alpha_r * aptr[ia+1]  + alpha_i * aptr[ia];
+			ia += 2;
+		}
+		aptr += lda;
+		bptr += ldb;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_rnc.c b/kernel/riscv64/zomatcopy_rnc.c
new file mode 100644
index 000000000..fc27f17ec
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_rnc.c
@@ -0,0 +1,69 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * No Trans , conjugate
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+	bptr = b;
+
+	lda *=2;
+	ldb *=2;
+
+	for ( i=0; i<rows ; i++ )
+	{
+		ia = 0;
+		for(j=0; j<cols; j++)
+		{
+			bptr[ia]   =   alpha_r * aptr[ia]    + alpha_i * aptr[ia+1];
+			bptr[ia+1] = - alpha_r * aptr[ia+1]  + alpha_i * aptr[ia];
+			ia += 2;
+		}
+		aptr += lda;
+		bptr += ldb;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_rt.c b/kernel/riscv64/zomatcopy_rt.c
new file mode 100644
index 000000000..d34db24e0
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_rt.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia,ib;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+
+	lda *= 2;
+	ldb *= 2;
+	ib = 0;
+
+	for ( i=0; i<rows ; i++ )
+	{
+		bptr = &b[ib];
+		ia = 0;
+
+		for(j=0; j<cols; j++)
+		{
+			bptr[0]   = alpha_r * aptr[ia]   - alpha_i * aptr[ia+1];
+			bptr[1]   = alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+			ia += 2;
+			bptr += ldb;
+		}
+		aptr += lda;
+		ib += 2;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zomatcopy_rtc.c b/kernel/riscv64/zomatcopy_rtc.c
new file mode 100644
index 000000000..a80ee6dfe
--- /dev/null
+++ b/kernel/riscv64/zomatcopy_rtc.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * Trans, conjugate
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+	BLASLONG i,j,ia,ib;
+	FLOAT *aptr,*bptr;
+
+	if ( rows <= 0     )  return(0);
+	if ( cols <= 0     )  return(0);
+
+	aptr = a;
+
+	lda *= 2;
+	ldb *= 2;
+	ib = 0;
+
+	for ( i=0; i<rows ; i++ )
+	{
+		bptr = &b[ib];
+		ia = 0;
+
+		for(j=0; j<cols; j++)
+		{
+			bptr[0]   =   alpha_r * aptr[ia]   + alpha_i * aptr[ia+1];
+			bptr[1]   = - alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+			ia += 2;
+			bptr += ldb;
+		}
+		aptr += lda;
+		ib += 2;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zrot.c b/kernel/riscv64/zrot.c
new file mode 100644
index 000000000..98be68db8
--- /dev/null
+++ b/kernel/riscv64/zrot.c
@@ -0,0 +1,70 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT temp[2];
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	if ( n <= 0     )  return(0);
+
+	inc_x2 = 2 * inc_x ;
+	inc_y2 = 2 * inc_y ;
+
+	while(i < n)
+	{
+		temp[0]   = c*x[ix]   + s*y[iy] ;
+		temp[1]   = c*x[ix+1] + s*y[iy+1] ;
+		y[iy]     = c*y[iy]   - s*x[ix] ;
+		y[iy+1]   = c*y[iy+1] - s*x[ix+1] ;
+		x[ix]     = temp[0] ;
+		x[ix+1]   = temp[1] ;
+
+		ix += inc_x2 ;
+		iy += inc_y2 ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
new file mode 100644
index 000000000..a3fdda45a
--- /dev/null
+++ b/kernel/riscv64/zrot_vector.c
@@ -0,0 +1,162 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLEV_FLOAT vlev_float32xm4
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSEV_FLOAT vsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMULVF_FLOAT vfmulvf_float32xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLEV_FLOAT vlev_float64xm4
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSEV_FLOAT vsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMULVF_FLOAT vfmulvf_float64xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+        BLASLONG i=0, j=0;
+        BLASLONG ix=0,iy=0;
+
+        if (n < 1)  return(0);
+        unsigned int gvl = 0;
+
+        FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
+        gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+        BLASLONG inc_xv = inc_x * 2 * gvl;
+        BLASLONG inc_yv = inc_y * 2 * gvl;
+
+	if(inc_x==1 && inc_y==1){
+		for(i=0,j=0; i < n/gvl; i++){
+			vx0 = VLEV_FLOAT(&x[ix], gvl);
+			vx1 = VLEV_FLOAT(&x[ix+gvl], gvl);
+			vy0 = VLEV_FLOAT(&y[ix], gvl);
+			vy1 = VLEV_FLOAT(&y[ix+gvl], gvl);
+
+			vt0 = VFMULVF_FLOAT(vx0, c, gvl);
+			vt0 = VFMACCVF_FLOAT(vt0, s, vy0, gvl);
+			vt1 = VFMULVF_FLOAT(vx1, c, gvl);
+			vt1 = VFMACCVF_FLOAT(vt1, s, vy1, gvl);
+			vy0 = VFMULVF_FLOAT(vy0, c, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, gvl);
+			vy1 = VFMULVF_FLOAT(vy1, c, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, gvl);
+
+			VSEV_FLOAT(&x[ix], vt0, gvl);
+			VSEV_FLOAT(&x[ix+gvl], vt1, gvl);
+			VSEV_FLOAT(&y[ix], vy0, gvl);
+			VSEV_FLOAT(&y[ix+gvl], vy1, gvl);
+
+			j += gvl;
+			ix += 2*gvl;
+		}
+		if(j < n){
+			gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+						vx0 = VLEV_FLOAT(&x[ix], gvl);
+			vx1 = VLEV_FLOAT(&x[ix+gvl], gvl);
+			vy0 = VLEV_FLOAT(&y[ix], gvl);
+			vy1 = VLEV_FLOAT(&y[ix+gvl], gvl);
+
+			vt0 = VFMULVF_FLOAT(vx0, c, gvl);
+			vt0 = VFMACCVF_FLOAT(vt0, s, vy0, gvl);
+			vt1 = VFMULVF_FLOAT(vx1, c, gvl);
+			vt1 = VFMACCVF_FLOAT(vt1, s, vy1, gvl);
+			vy0 = VFMULVF_FLOAT(vy0, c, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, gvl);
+			vy1 = VFMULVF_FLOAT(vy1, c, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, gvl);
+
+			VSEV_FLOAT(&x[ix], vt0, gvl);
+			VSEV_FLOAT(&x[ix+gvl], vt1, gvl);
+			VSEV_FLOAT(&y[ix], vy0, gvl);
+			VSEV_FLOAT(&y[ix+gvl], vy1, gvl);
+		}
+		
+	}else{
+		for(i=0,j=0; i < n/gvl; i++){
+			vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+			vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+			vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+			vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+
+			vt0 = VFMULVF_FLOAT(vx0, c, gvl);
+			vt0 = VFMACCVF_FLOAT(vt0, s, vy0, gvl);
+			vt1 = VFMULVF_FLOAT(vx1, c, gvl);
+			vt1 = VFMACCVF_FLOAT(vt1, s, vy1, gvl);
+			vy0 = VFMULVF_FLOAT(vy0, c, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, gvl);
+			vy1 = VFMULVF_FLOAT(vy1, c, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, gvl);
+
+			VSSEV_FLOAT(&x[ix], stride_x, vt0, gvl);
+			VSSEV_FLOAT(&x[ix+1], stride_x, vt1, gvl);
+			VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+			VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+			j += gvl;
+			ix += inc_xv;
+			iy += inc_yv;
+		}
+		if(j < n){
+			gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+			vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+			vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+			vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+			vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+
+			vt0 = VFMULVF_FLOAT(vx0, c, gvl);
+			vt0 = VFMACCVF_FLOAT(vt0, s, vy0, gvl);
+			vt1 = VFMULVF_FLOAT(vx1, c, gvl);
+			vt1 = VFMACCVF_FLOAT(vt1, s, vy1, gvl);
+			vy0 = VFMULVF_FLOAT(vy0, c, gvl);
+			vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, gvl);
+			vy1 = VFMULVF_FLOAT(vy1, c, gvl);
+			vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, gvl);
+
+			VSSEV_FLOAT(&x[ix], stride_x, vt0, gvl);
+			VSSEV_FLOAT(&x[ix+1], stride_x, vt1, gvl);
+			VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+			VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+		}
+	}
+        return(0);
+}
diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c
new file mode 100644
index 000000000..0521aaa0b
--- /dev/null
+++ b/kernel/riscv64/zscal.c
@@ -0,0 +1,88 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG inc_x2;
+	BLASLONG ip = 0;
+	FLOAT temp;
+
+        if ( (n <= 0) || (inc_x <= 0))
+                return(0);
+
+
+	inc_x2 = 2 * inc_x;
+	for ( i=0; i<n; i++ )
+	{
+		if ( da_r == 0.0 )
+		{
+			if ( da_i == 0.0 )
+			{
+				temp = 0.0;
+				x[ip+1] = 0.0 ;
+			}
+			else
+			{
+				temp = - da_i * x[ip+1] ;
+				x[ip+1] = da_i * x[ip]  ;
+			}
+		}
+		else
+		{
+			if ( da_i == 0.0 )
+			{
+				temp    = da_r * x[ip]  ;
+				x[ip+1] = da_r * x[ip+1];
+			}
+			else
+			{
+				temp    = da_r * x[ip]   - da_i * x[ip+1] ;
+				x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
+			}
+		}
+		x[ip]   = temp;
+
+		ip += inc_x2;
+	}
+
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
new file mode 100644
index 000000000..796c52a02
--- /dev/null
+++ b/kernel/riscv64/zscal_vector.c
@@ -0,0 +1,152 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M4
+#define FLOAT_V_T float32xm4_t
+#define VLSEV_FLOAT vlsev_float32xm4
+#define VSSEV_FLOAT vssev_float32xm4
+#define VFMACCVF_FLOAT vfmaccvf_float32xm4
+#define VFMULVF_FLOAT vfmulvf_float32xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M4
+#define FLOAT_V_T float64xm4_t
+#define VLSEV_FLOAT vlsev_float64xm4
+#define VSSEV_FLOAT vssev_float64xm4
+#define VFMACCVF_FLOAT vfmaccvf_float64xm4
+#define VFMULVF_FLOAT vfmulvf_float64xm4
+#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+        BLASLONG i=0, j=0;
+        BLASLONG ix=0;
+
+
+        if((n <= 0) || (inc_x <= 0))
+                return(0);
+
+        unsigned int gvl = 0;
+        FLOAT_V_T vt, v0, v1;
+        if(da_r == 0.0 && da_i == 0.0){
+                memset(&x[0], 0, n * 2 * sizeof(FLOAT));
+        }else if(da_r == 0.0){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * 2 * gvl;
+                for(i=0,j=0; i < n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+
+                        vt = VFMULVF_FLOAT(v1, -da_i, gvl);
+                        v1 = VFMULVF_FLOAT(v0, da_i, gvl);
+
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                }
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+
+                        vt = VFMULVF_FLOAT(v1, -da_i, gvl);
+                        v1 = VFMULVF_FLOAT(v0, da_i, gvl);
+
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
+                }
+        }else if(da_i == 0.0){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * 2 * gvl;
+                for(i=0,j=0; i < n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+
+                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
+                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
+
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                }
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+
+                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
+                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
+
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * 2 * gvl;
+                for(i=0,j=0; i < n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+
+                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
+                        vt = VFNMSACVF_FLOAT(vt, da_i, v1, gvl);
+                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
+                        v1 = VFMACCVF_FLOAT(v1, da_i, v0, gvl);
+
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                }
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+
+                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
+                        vt = VFNMSACVF_FLOAT(vt, da_i, v1, gvl);
+                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
+                        v1 = VFMACCVF_FLOAT(v1, da_i, v0, gvl);
+
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
+                }
+        }
+        return(0);
+}
diff --git a/kernel/riscv64/zswap.c b/kernel/riscv64/zswap.c
new file mode 100644
index 000000000..ae4760ae0
--- /dev/null
+++ b/kernel/riscv64/zswap.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <stdio.h>
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT temp[2];
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	if ( n < 0     )  return(0);
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	while(i < n)
+	{
+
+		temp[0]  = x[ix]   ;
+		temp[1]  = x[ix+1] ;
+		x[ix]    = y[iy]   ;
+		x[ix+1]  = y[iy+1] ;
+		y[iy]    = temp[0] ;
+		y[iy+1]  = temp[1] ;
+
+		ix += inc_x2 ;
+		iy += inc_y2 ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
new file mode 100644
index 000000000..b655a968c
--- /dev/null
+++ b/kernel/riscv64/zswap_vector.c
@@ -0,0 +1,117 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <stdio.h>
+#if !defined(DOUBLE)
+#define RVV_EFLOAT RVV_E32
+#define RVV_M RVV_M8
+#define FLOAT_V_T float32xm8_t
+#define VLEV_FLOAT vlev_float32xm8
+#define VLSEV_FLOAT vlsev_float32xm8
+#define VSEV_FLOAT vsev_float32xm8
+#define VSSEV_FLOAT vssev_float32xm8
+#else
+#define RVV_EFLOAT RVV_E64
+#define RVV_M RVV_M8
+#define FLOAT_V_T float64xm8_t
+#define VLEV_FLOAT vlev_float64xm8
+#define VLSEV_FLOAT vlsev_float64xm8
+#define VSEV_FLOAT vsev_float64xm8
+#define VSSEV_FLOAT vssev_float64xm8
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i = 0, j = 0;
+	BLASLONG ix = 0,iy = 0;
+        BLASLONG stride_x, stride_y;
+        FLOAT_V_T vx0, vx1, vy0, vy1;
+        unsigned int gvl = 0;
+
+	if (n < 0)  return(0);
+        if(inc_x == 1 && inc_y == 1){
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                BLASLONG n2 = n * 2;
+                if(gvl <= n2/2){
+                        for(i=0,j=0; i<n2/(2*gvl); i++){
+                                vx0 = VLEV_FLOAT(&x[j], gvl);
+                                vy0 = VLEV_FLOAT(&y[j], gvl);
+                                VSEV_FLOAT(&x[j], vy0, gvl);
+                                VSEV_FLOAT(&y[j], vx0, gvl);
+
+                                vx1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
+                                VSEV_FLOAT(&x[j+gvl], vy1, gvl);
+                                VSEV_FLOAT(&y[j+gvl], vx1, gvl);
+                                j += gvl * 2;
+                        }
+                }
+                for(;j<n2;){
+                        gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M);
+                        vx0 = VLEV_FLOAT(&x[j], gvl);
+                        vy0 = VLEV_FLOAT(&y[j], gvl);
+                        VSEV_FLOAT(&x[j], vy0, gvl);
+                        VSEV_FLOAT(&y[j], vx0, gvl);
+                        j += gvl;
+                }
+        }else{
+                gvl = vsetvli(n, RVV_EFLOAT, RVV_M);
+                stride_x = inc_x * 2 * sizeof(FLOAT);
+                stride_y = inc_y * 2 * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * gvl * 2;
+                BLASLONG inc_yv = inc_y * gvl * 2;
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                        vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                        VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, vy1, gvl);
+                        VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+                        VSSEV_FLOAT(&y[iy+1], stride_y, vx1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                        iy += inc_yv;
+                }
+                if(j < n){
+                        gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M);
+                        vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                        vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                        VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, vy1, gvl);
+                        VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl);
+                        VSSEV_FLOAT(&y[iy+1], stride_y, vx1, gvl);
+                }
+        }
+	return(0);
+}
+
+
diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h
index ef8fcb865..3802a91e1 100644
--- a/kernel/simd/intrin.h
+++ b/kernel/simd/intrin.h
@@ -47,7 +47,7 @@ extern "C" {
 #endif
 
 /** AVX **/
-#ifdef HAVE_AVX
+#if defined(HAVE_AVX) || defined(HAVE_FMA3)
 #include <immintrin.h>
 #endif
 
diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h
index 3f79646e0..fbe531417 100644
--- a/kernel/simd/intrin_avx.h
+++ b/kernel/simd/intrin_avx.h
@@ -12,6 +12,8 @@ typedef __m256d  v_f64;
  ***************************/
 #define v_add_f32 _mm256_add_ps
 #define v_add_f64 _mm256_add_pd
+#define v_sub_f32 _mm256_sub_ps
+#define v_sub_f64 _mm256_sub_pd
 #define v_mul_f32 _mm256_mul_ps
 #define v_mul_f64 _mm256_mul_pd
 
@@ -19,12 +21,20 @@ typedef __m256d  v_f64;
     // multiply and add, a*b + c
     #define v_muladd_f32 _mm256_fmadd_ps
     #define v_muladd_f64 _mm256_fmadd_pd
+    // multiply and subtract, a*b - c
+    #define v_mulsub_f32 _mm256_fmsub_ps
+    #define v_mulsub_f64 _mm256_fmsub_pd
 #else
     // multiply and add, a*b + c
     BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
     { return v_add_f32(v_mul_f32(a, b), c); }
     BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
     { return v_add_f64(v_mul_f64(a, b), c); }
+    // multiply and subtract, a*b - c
+    BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_sub_f32(v_mul_f32(a, b), c); }
+    BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c)
+    { return v_sub_f64(v_mul_f64(a, b), c); }
 #endif // !HAVE_FMA3
 
 // Horizontal add: Calculates the sum of all vector elements.
diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h
index f00af53e9..8f38eedd9 100644
--- a/kernel/simd/intrin_avx512.h
+++ b/kernel/simd/intrin_avx512.h
@@ -12,11 +12,16 @@ typedef __m512d  v_f64;
  ***************************/
 #define v_add_f32 _mm512_add_ps
 #define v_add_f64 _mm512_add_pd
+#define v_sub_f32 _mm512_sub_ps
+#define v_sub_f64 _mm512_sub_pd
 #define v_mul_f32 _mm512_mul_ps
 #define v_mul_f64 _mm512_mul_pd
 // multiply and add, a*b + c
 #define v_muladd_f32 _mm512_fmadd_ps
 #define v_muladd_f64 _mm512_fmadd_pd
+// multiply and subtract, a*b - c
+#define v_mulsub_f32 _mm512_fmsub_ps
+#define v_mulsub_f64 _mm512_fmsub_pd
 BLAS_FINLINE float v_sum_f32(v_f32 a)
 {
     __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h
index 22cef10ca..cd44599fe 100644
--- a/kernel/simd/intrin_neon.h
+++ b/kernel/simd/intrin_neon.h
@@ -18,6 +18,8 @@ typedef float32x4_t v_f32;
  ***************************/
 #define v_add_f32 vaddq_f32
 #define v_add_f64 vaddq_f64
+#define v_sub_f32 vsubq_f32
+#define v_sub_f64 vsubq_f64
 #define v_mul_f32 vmulq_f32
 #define v_mul_f64 vmulq_f64
 
@@ -26,16 +28,24 @@ typedef float32x4_t v_f32;
     // multiply and add, a*b + c
     BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
     { return vfmaq_f32(c, a, b); }
+    // multiply and subtract, a*b - c
+    BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return vfmaq_f32(vnegq_f32(c), a, b); }
 #else
     // multiply and add, a*b + c
     BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
     { return vmlaq_f32(c, a, b); }
+    // multiply and subtract, a*b - c
+    BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return vmlaq_f32(vnegq_f32(c), a, b); }
 #endif
 
 // FUSED F64
 #if V_SIMD_F64
     BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
     { return vfmaq_f64(c, a, b); }
+    BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c)
+    { return vfmaq_f64(vnegq_f64(c), a, b); }
 #endif
 
 // Horizontal add: Calculates the sum of all vector elements.
diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h
index 06a3fe78b..6a542072e 100644
--- a/kernel/simd/intrin_sse.h
+++ b/kernel/simd/intrin_sse.h
@@ -12,22 +12,35 @@ typedef __m128d  v_f64;
  ***************************/
 #define v_add_f32 _mm_add_ps
 #define v_add_f64 _mm_add_pd
+#define v_sub_f32 _mm_sub_ps
+#define v_sub_f64 _mm_sub_pd
 #define v_mul_f32 _mm_mul_ps
 #define v_mul_f64 _mm_mul_pd
 #ifdef HAVE_FMA3
     // multiply and add, a*b + c
     #define v_muladd_f32 _mm_fmadd_ps
     #define v_muladd_f64 _mm_fmadd_pd
+    // multiply and subtract, a*b - c
+    #define v_mulsub_f32 _mm_fmsub_ps
+    #define v_mulsub_f64 _mm_fmsub_pd
 #elif defined(HAVE_FMA4)
     // multiply and add, a*b + c
     #define v_muladd_f32 _mm_macc_ps
     #define v_muladd_f64 _mm_macc_pd
+    // multiply and subtract, a*b - c
+    #define v_mulsub_f32 _mm_msub_ps
+    #define v_mulsub_f64 _mm_msub_pd
 #else
     // multiply and add, a*b + c
     BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
     { return v_add_f32(v_mul_f32(a, b), c); }
     BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
     { return v_add_f64(v_mul_f64(a, b), c); }
+    // multiply and subtract, a*b - c
+    BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_sub_f32(v_mul_f32(a, b), c); }
+    BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c)
+    { return v_sub_f64(v_mul_f64(a, b), c); }
 #endif // HAVE_FMA3
 
 // Horizontal add: Calculates the sum of all vector elements.
diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c
index 8a40ea4b9..ddec21383 100644
--- a/kernel/x86_64/dasum.c
+++ b/kernel/x86_64/dasum.c
@@ -58,21 +58,19 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
 }
 
 #endif
-
-FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-    BLASLONG i=0;
+    BLASLONG i = 0;
     FLOAT sumf = 0.0;
+    
+    if (n <= 0 || inc_x <= 0) return (sumf);
 
-    if (n <= 0 || inc_x <= 0) return(sumf);
-
-    if ( inc_x == 1 ) {
+    if (inc_x == 1) {
         sumf = dasum_kernel(n, x);
-    } 
+    }
     else {
         n *= inc_x;
-       
-        while(i < n) {
+        while (i < n) {
             sumf += ABS_K(x[i]);
             i += inc_x;
         }
@@ -80,3 +78,53 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     return(sumf);
 }
 
+#if defined(SMP)
+static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
+{
+    *(FLOAT *)result = asum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha;
+    FLOAT * dummy_b;
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 100000 || inc_x <= 0)
+        nthreads = 1;
+    else 
+	    nthreads = num_cpu < n/100000 ? num_cpu : n/100000;
+
+    if (nthreads == 1) {
+        sumf = asum_compute(n, x, inc_x);
+    } else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_REAL;
+#else
+        mode = BLAS_DOUBLE | BLAS_REAL;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
+        }
+    }
+#else
+    sumf = asum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
+
diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c
index a312b7ff9..66e9ff907 100644
--- a/kernel/x86_64/drot.c
+++ b/kernel/x86_64/drot.c
@@ -7,10 +7,76 @@
 #endif
 
 #ifndef HAVE_DROT_KERNEL
+#include "../simd/intrin.h"
 
 static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
 {
     BLASLONG i = 0;
+#if V_SIMD_F64 && V_SIMD > 256
+    const int vstep = v_nlanes_f64;
+    const int unrollx4 = n & (-vstep * 4);
+    const int unrollx = n & -vstep;
+
+    v_f64 __c = v_setall_f64(c);
+    v_f64 __s = v_setall_f64(s);
+    v_f64 vx0, vx1, vx2, vx3;
+    v_f64 vy0, vy1, vy2, vy3;
+    v_f64 vt0, vt1, vt2, vt3;
+
+    for (; i < unrollx4; i += vstep * 4) {
+        vx0 = v_loadu_f64(x + i);
+        vx1 = v_loadu_f64(x + i + vstep);
+        vx2 = v_loadu_f64(x + i + vstep * 2);
+        vx3 = v_loadu_f64(x + i + vstep * 3);
+        vy0 = v_loadu_f64(y + i);
+        vy1 = v_loadu_f64(y + i + vstep);
+        vy2 = v_loadu_f64(y + i + vstep * 2);
+        vy3 = v_loadu_f64(y + i + vstep * 3);
+
+        vt0 = v_mul_f64(__s, vy0);
+        vt1 = v_mul_f64(__s, vy1);
+        vt2 = v_mul_f64(__s, vy2);
+        vt3 = v_mul_f64(__s, vy3);
+
+        vt0 = v_muladd_f64(__c, vx0, vt0);
+        vt1 = v_muladd_f64(__c, vx1, vt1);
+        vt2 = v_muladd_f64(__c, vx2, vt2);
+        vt3 = v_muladd_f64(__c, vx3, vt3);
+
+        v_storeu_f64(x + i, vt0);
+        v_storeu_f64(x + i + vstep, vt1);
+        v_storeu_f64(x + i + vstep * 2, vt2);
+        v_storeu_f64(x + i + vstep * 3, vt3);
+
+        vt0 = v_mul_f64(__s, vx0);
+        vt1 = v_mul_f64(__s, vx1);
+        vt2 = v_mul_f64(__s, vx2);
+        vt3 = v_mul_f64(__s, vx3);
+
+        vt0 = v_mulsub_f64(__c, vy0, vt0);
+        vt1 = v_mulsub_f64(__c, vy1, vt1);
+        vt2 = v_mulsub_f64(__c, vy2, vt2);
+        vt3 = v_mulsub_f64(__c, vy3, vt3);
+
+        v_storeu_f64(y + i, vt0);
+        v_storeu_f64(y + i + vstep, vt1);
+        v_storeu_f64(y + i + vstep * 2, vt2);
+        v_storeu_f64(y + i + vstep * 3, vt3);
+    }
+
+    for (; i < unrollx; i += vstep) {
+        vx0 = v_loadu_f64(x + i);
+        vy0 = v_loadu_f64(y + i);
+
+        vt0 = v_mul_f64(__s, vy0);
+        vt0 = v_muladd_f64(__c, vx0, vt0);
+        v_storeu_f64(x + i, vt0);
+
+        vt0 = v_mul_f64(__s, vx0);
+        vt0 = v_mulsub_f64(__c, vy0, vt0);
+        v_storeu_f64(y + i, vt0);
+    }
+#else
     FLOAT f0, f1, f2, f3;
     FLOAT x0, x1, x2, x3;
     FLOAT g0, g1, g2, g3;
@@ -53,7 +119,7 @@ static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
         yp += 4;
         i += 4;
     }
-
+#endif
     while (i < n) {
         FLOAT temp = c*x[i] + s*y[i];
         y[i] = c*y[i] - s*x[i];
diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c
index 36ec4a737..d0cea9bee 100644
--- a/kernel/x86_64/sasum.c
+++ b/kernel/x86_64/sasum.c
@@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
 
 #endif
 
-FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+static FLOAT asum_compute(BLASLONG n, FLOAT * x, BLASLONG inc_x)
 {
-    BLASLONG i=0;
+    BLASLONG i = 0;
     FLOAT sumf = 0.0;
+    
+    if (n <= 0 || inc_x <= 0) return (sumf);
 
-    if (n <= 0 || inc_x <= 0) return(sumf);
-
-    if ( inc_x == 1 ) {
+    if (inc_x == 1) {
         sumf = sasum_kernel(n, x);
     }
     else {
-
         n *= inc_x;
         while(i < n) {
             sumf += ABS_K(x[i]);
             i += inc_x;
         }
-
     }
+    return (sumf);
+}
+
+#if defined(SMP)
+static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
+{
+    *(FLOAT *)result = asum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha;
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 100000 || inc_x <= 0)
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/100000 ? num_cpu : n/100000;
+    if (nthreads == 1) {
+        sumf = asum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];
+        FLOAT * ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_REAL;
+#else
+        mode = BLAS_DOUBLE | BLAS_REAL;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
+        }
+    }
+#else
+    sumf = asum_compute(n, x, inc_x);
+#endif
     return(sumf);
 }
diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c
index 021c20d82..3de586cb8 100644
--- a/kernel/x86_64/srot.c
+++ b/kernel/x86_64/srot.c
@@ -7,10 +7,78 @@
 #endif
 
 #ifndef HAVE_SROT_KERNEL
+#include"../simd/intrin.h"
 
 static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
 {
     BLASLONG i = 0;
+    
+#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128)
+    const int vstep = v_nlanes_f32;
+    const int unrollx4 = n & (-vstep * 4);
+    const int unrollx = n & -vstep;
+
+    v_f32 __c = v_setall_f32(c);
+    v_f32 __s = v_setall_f32(s);
+    v_f32 vx0, vx1, vx2, vx3;
+    v_f32 vy0, vy1, vy2, vy3;
+    v_f32 vt0, vt1, vt2, vt3;
+
+    for (; i < unrollx4; i += vstep * 4) {
+        vx0 = v_loadu_f32(x + i);
+        vx1 = v_loadu_f32(x + i + vstep);
+        vx2 = v_loadu_f32(x + i + vstep * 2);
+        vx3 = v_loadu_f32(x + i + vstep * 3);
+        vy0 = v_loadu_f32(y + i);
+        vy1 = v_loadu_f32(y + i + vstep);
+        vy2 = v_loadu_f32(y + i + vstep * 2);
+        vy3 = v_loadu_f32(y + i + vstep * 3);
+
+        vt0 = v_mul_f32(__s, vy0);
+        vt1 = v_mul_f32(__s, vy1);
+        vt2 = v_mul_f32(__s, vy2);
+        vt3 = v_mul_f32(__s, vy3);
+
+        vt0 = v_muladd_f32(__c, vx0, vt0);
+        vt1 = v_muladd_f32(__c, vx1, vt1);
+        vt2 = v_muladd_f32(__c, vx2, vt2);
+        vt3 = v_muladd_f32(__c, vx3, vt3);
+
+        v_storeu_f32(x + i, vt0);
+        v_storeu_f32(x + i + vstep, vt1);
+        v_storeu_f32(x + i + vstep * 2, vt2);
+        v_storeu_f32(x + i + vstep * 3, vt3);
+
+        vt0 = v_mul_f32(__s, vx0);
+        vt1 = v_mul_f32(__s, vx1);
+        vt2 = v_mul_f32(__s, vx2);
+        vt3 = v_mul_f32(__s, vx3);
+
+        vt0 = v_mulsub_f32(__c, vy0, vt0);
+        vt1 = v_mulsub_f32(__c, vy1, vt1);
+        vt2 = v_mulsub_f32(__c, vy2, vt2);
+        vt3 = v_mulsub_f32(__c, vy3, vt3);
+
+        v_storeu_f32(y + i, vt0);
+        v_storeu_f32(y + i + vstep, vt1);
+        v_storeu_f32(y + i + vstep * 2, vt2);
+        v_storeu_f32(y + i + vstep * 3, vt3);
+
+    }
+
+    for (; i < unrollx; i += vstep) {
+        vx0 = v_loadu_f32(x + i);
+        vy0 = v_loadu_f32(y + i);
+
+        vt0 = v_mul_f32(__s, vy0);
+        vt0 = v_muladd_f32(__c, vx0, vt0);
+        v_storeu_f32(x + i, vt0);
+
+        vt0 = v_mul_f32(__s, vx0);
+        vt0 = v_mulsub_f32(__c, vy0, vt0);
+        v_storeu_f32(y + i, vt0);
+    }
+#else
     FLOAT f0, f1, f2, f3;
     FLOAT x0, x1, x2, x3;
     FLOAT g0, g1, g2, g3;
@@ -20,7 +88,6 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
     FLOAT* yp = y;
 
     BLASLONG n1 = n & (~7);
-
     while (i < n1) {
         x0 = xp[0];
         y0 = yp[0];
@@ -53,6 +120,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
         yp += 4;
         i += 4;
     }
+#endif
 
     while (i < n) {
         FLOAT temp = c*x[i] + s*y[i];
diff --git a/lapack/laswp/riscv64/Makefile b/lapack/laswp/riscv64/Makefile
new file mode 100644
index 000000000..75411deb5
--- /dev/null
+++ b/lapack/laswp/riscv64/Makefile
@@ -0,0 +1,13 @@
+TOPDIR	= ../../..
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP	= ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP	= ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index 2047e4776..7789c83c7 100644
--- a/param.h
+++ b/param.h
@@ -2676,6 +2676,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P  16
 #endif
 
+#ifdef RISCV64_GENERIC
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  2
+#define SGEMM_DEFAULT_UNROLL_N  2
+
+#define DGEMM_DEFAULT_UNROLL_M  2
+#define DGEMM_DEFAULT_UNROLL_N  2
+
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  2
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  2
+
+#define SGEMM_DEFAULT_P	128
+#define DGEMM_DEFAULT_P	128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P	16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
+#ifdef C910V
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  16
+#define SGEMM_DEFAULT_UNROLL_N  4
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
+
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  2
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  2
+
+#define SGEMM_DEFAULT_P	160
+#define DGEMM_DEFAULT_P	160
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P	16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
 #ifdef ARMV7
 #define SNUMOPT		2
 #define DNUMOPT		2
diff --git a/test/Makefile b/test/Makefile
index eb3bc3447..1ecce0be7 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -258,6 +258,12 @@ endif
 
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
+
+ifeq ($(CORE), C910V)
+EXTRALIB =
+CEXTRALIB =
+endif
+
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index dc5175fc5..357e61301 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -27,13 +27,17 @@ endif ()
 
 # known to hang with the native Windows and Android threads
 # FIXME needs checking if this works on any of the other platforms
-if (NOT USE_OPENMP)
 if (OS_CYGWIN_NT OR OS_LINUX)
+if (NOT USE_OPENMP)
 set(OpenBLAS_utest_src
   ${OpenBLAS_utest_src}
   test_fork.c
   )
 endif()
+set(OpenBLAS_utest_src
+  ${OpenBLAS_utest_src}
+  test_post_fork.c
+  )
 endif()
 
 if (NOT NO_LAPACK)
diff --git a/utest/Makefile b/utest/Makefile
index 31d4ccf00..ac8c6f72a 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -25,10 +25,11 @@ endif
 
 #this does not work with OpenMP nor with native Windows or Android threads
 # FIXME TBD if this works on OSX, SunOS, POWER and zarch
-ifndef USE_OPENMP
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
+ifneq ($(USE_OPENMP), 1)
 OBJS += test_fork.o
 endif
+OBJS += test_post_fork.o
 endif
 
 ifeq ($(C_COMPILER), PGI)
diff --git a/utest/test_fork.c b/utest/test_fork.c
index 5c976f920..bd531e7fb 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <cblas.h>
 #include "openblas_utest.h"
 
-void* xmalloc(size_t n)
+static void* xmalloc(size_t n)
 {
     void* tmp;
     tmp = malloc(n);
@@ -49,7 +49,7 @@ void* xmalloc(size_t n)
 }
 
 #ifdef BUILD_DOUBLE
-void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
+static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
 {
     char trans1 = 'T';
     char trans2 = 'N';
diff --git a/utest/test_post_fork.c b/utest/test_post_fork.c
new file mode 100644
index 000000000..9370a02ce
--- /dev/null
+++ b/utest/test_post_fork.c
@@ -0,0 +1,131 @@
+/*****************************************************************************
+Copyright (c) 2011-2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <cblas.h>
+#ifdef USE_OPENMP
+#include <omp.h>
+#endif
+#include "openblas_utest.h"
+
+static void* xmalloc(size_t n)
+{
+    void* tmp;
+    tmp = malloc(n);
+    if (tmp == NULL) {
+        fprintf(stderr, "You are about to die\n");
+        exit(1);
+    } else {
+        return tmp;
+    }
+}
+
+#ifdef BUILD_DOUBLE
+static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n)
+{
+    char trans1 = 'T';
+    char trans2 = 'N';
+    double zerod = 0, oned = 1;
+    int i;
+    BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n);
+    for(i = 0; i < n * n; ++i) {
+        ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS);
+    }
+}
+#endif
+
+CTEST(fork, safety_after_fork_in_parent)
+{
+#ifndef BUILD_DOUBLE
+exit(0);
+#else
+    blasint n = 100;
+    int i, nthreads_omp;
+
+    double *a, *b, *c, *d;
+    size_t n_bytes;
+
+    pid_t fork_pid;
+
+    n_bytes = sizeof(*a) * n * n;
+
+    a = xmalloc(n_bytes);
+    b = xmalloc(n_bytes);
+    c = xmalloc(n_bytes);
+    d = xmalloc(n_bytes);
+
+    // Put ones in a, b and n in c (result)
+    for(i = 0; i < n * n; ++i) {
+        a[i] = 1;
+        b[i] = 1;
+        c[i] = 1 * n;
+    }
+
+    // Test that OpenBLAS works after a fork.
+    // This situation routinely happens with Pythons numpy where a
+    // `sys.platform` calls `uname` in a forked process.
+    // So we simulate this situation here.
+
+    // There was an issue where a different number of OpenBLAS and OpenMP
+    // threads triggered a memory leak. So run this multiple times
+    // with different number of threads set.
+#ifdef USE_OPENMP
+    nthreads_omp = omp_get_max_threads();
+    // Run with half the max OMP threads, the max threads and twice that
+    for(i = (nthreads_omp + 1) / 2; i <= nthreads_omp * 2; i *= 2) {
+        omp_set_num_threads(i);
+#endif
+
+        fork_pid = fork();
+        if (fork_pid == -1) {
+            CTEST_ERR("Failed to fork process.");
+        } else if (fork_pid == 0) {
+            // Just pretend to do something, e.g. call `uname`, then exit
+            exit(0);
+        } else {
+            // Wait for the child to finish and check the exit code.
+            int child_status = 0;
+            pid_t wait_pid = wait(&child_status);
+            ASSERT_EQUAL(wait_pid, fork_pid);
+            ASSERT_EQUAL(0, WEXITSTATUS (child_status));
+
+            // Now OpenBLAS has to work
+            check_dgemm(a, b, d, c, n);
+        }
+#ifdef USE_OPENMP
+    }
+#endif
+
+#endif
+}