diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 39ec96246..5378c79bf 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -201,3 +201,6 @@ In chronological order: * Bine Brank * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM + * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions + * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions + * [2022-01-18] SVE kernels and copy functions for TRSM diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 9849ddc93..8aa6728d5 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -323,55 +323,93 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #hemm - GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}HEMMUTCOPY_M) + set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") + set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") +endif() + GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) # symm for c and z +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + 
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ZTRSMCOPYLN_M) + set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") 
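+# If the target's KERNEL file defines the per-architecture sources (e.g.
+# kernel/arm64/KERNEL.A64FX sets ZTRSMCOPYUN_M / ZTRSMCOPYLN_M / ZTRSMCOPYUT_M /
+# ZTRSMCOPYLT_M to the SVE copy routines), they are picked up from ${KERNELDIR}
+# in the else branch below; otherwise the generic sources above, parameterized
+# by the GEMM unroll factor, are used.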
+else () + set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") + set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") + set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") + set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -465,23 +503,35 @@ endif () GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED TRSMCOPYLN_M) + set(TRSMUNCOPY_M 
"generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") + set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") + set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") + set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index d22bd46a5..2d5740183 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1691,29 +1691,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE 
-UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@

+ifdef CTRMMUNCOPY_M
+$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+else
 $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
 
 $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+endif
 
-$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+ifdef CTRMMLNCOPY_M
+$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
 
-$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+else
+$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+endif
+
+ifdef CTRMMUTCOPY_M
+$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+else
 $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
 
 $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+endif
 
-$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+ifdef CTRMMLTCOPY_M
+$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
 
-$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+else
+$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+endif
 
 $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -1739,29 +1771,61 @@
$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRMMUNCOPY_M +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLNCOPY_M +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMUTCOPY_M +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLTCOPY_M +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1897,11 +1961,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : 
generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef CSYMMUCOPY_M +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef CSYMMLCOPY_M +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1909,11 +1983,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef ZSYMMUCOPY_M +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef ZSYMMLCOPY_M +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1933,11 +2017,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N $(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef CHEMMUTCOPY_M +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef CHEMMLTCOPY_M +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -1945,11 +2039,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : 
generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef ZHEMMUTCOPY_M +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef ZHEMMLTCOPY_M +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -2287,29 +2391,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef 
TRSMCOPYLT_M +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2335,29 +2471,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLT_M +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX 
-UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2431,29 +2599,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2479,29 +2679,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : 
generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 80be4ddd0..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S @@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = 
zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 0364a929c..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S @@ -140,8 +151,8 @@ DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S DGEMMINCOPY = 
dgemm_ncopy_sve_v1.c DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) @@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..38770f66b --- /dev/null +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/

+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0           X1           X2           s0            s1          X3         X4         X5          X6  */
+/* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphaR, FLOAT alphaI, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc) */
+
+#define origM		x0
+#define origN		x1
+#define origK		x2
+#define origPA		x3
+#define origPB		x4
+#define pC		x5
+#define LDC		x6
+#define temp		x7
+#define counterL	x8
+#define counterI	x9
+#define counterJ	x10
+#define pB		x11
+#define pCRow0		x12
+#define pCRow1		x13
+#define pCRow2		x14
+#define pCRow3		x15
+#define pA		x16
+#define lanes		x17
+
+#define alphaR		w19
+#define alphaI		w20
+
+#define alphaz_R	z6.s
+#define alphaz_I	z7.s
+#define alpha0_R	s4
+#define alpha0_I	s5
+
+
+#define A_PRE_SIZE	2560
+#define B_PRE_SIZE	448
+#define C_PRE_SIZE	128
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr	fmla
+#define OP_ii	fmls
+#define OP_ri	fmla
+#define OP_ir	fmla
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr	fmla
+#define OP_ii	fmla
+#define OP_ri	fmls
+#define OP_ir	fmla
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr	fmla
+#define OP_ii	fmla
+#define OP_ri	fmla
+#define OP_ir	fmls
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr	fmla
+#define OP_ii	fmls
+#define OP_ri	fmls
+#define OP_ir	fmls
+#endif
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset -> temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pCRow3
+// 16 pA
+// 17 lanes
+// 18 must save (platform register)
+// 19 must save alphaR
+// 20 must save alphaI
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 pA_R (current A vector, real parts; ALPHA_R arrives in s0)
+//v01 pA_I (current A vector, imaginary parts; ALPHA_I arrives in s1)
+//v02 pA_R (next A vector, real parts)
+//v03 pA_I (next A vector, imaginary parts)
+//v04 alpha0_R (scalar)
+//v05 alpha0_I (scalar)
+//v06 ALPHA_R broadcast (alphaz_R)
+//v07 ALPHA_I broadcast (alphaz_I)
+//v08 must save pB00_R
+//v09 must save pB00_I
+//v10 must save pB01_R
+//v11 must save pB01_I
+//v12 must save pB02_R
+//v13 must save pB02_I
+//v14 must save pB03_R
+//v15 must save pB03_I
+//v16 pC0_R (accumulator)
+//v17 pC0_I (accumulator)
+//v18 pC1_R (accumulator)
+//v19 pC1_I (accumulator)
+//v20 pC2_R (accumulator)
+//v21 pC2_I (accumulator)
+//v22 pC3_R (accumulator)
+//v23 pC3_I (accumulator)
+//v24 C row 0, real parts (load/store temporary)
+//v25 C row 0, imaginary parts (load/store temporary)
+//v26 C row 1, real parts (load/store temporary)
+//v27 C row 1, imaginary parts (load/store temporary)
+//v28 C row 2, real parts (load/store temporary)
+//v29 C row 2, imaginary parts (load/store temporary)
+//v30 C row 3, real parts (load/store temporary)
+//v31 C row 3, imaginary parts (load/store temporary)
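The OP_rr/OP_ii/OP_ri/OP_ir table above selects fmla or fmls per partial product so that one macro body serves every conjugation variant (the NN/NR/.../CC suffixes), and the SAVE macros later in the file fold the accumulators into C with a complex alpha using the same sign pattern. The following standalone C program is a scalar model of what a single SVE lane computes; it is an editorial sketch for orientation only, and cmla/scale_into_c are hypothetical names, not OpenBLAS functions.

/* Editorial sketch only: scalar model of one SVE lane of the kernel.
 * cmla() mirrors the OP_rr/OP_ii/OP_ri/OP_ir selection; conj_a/conj_b
 * correspond to conjugating the A and/or B operand. */
#include <stdio.h>

static void cmla(float *acc_r, float *acc_i,
                 float a_r, float a_i, float b_r, float b_i,
                 int conj_a, int conj_b)
{
    float sa = conj_a ? -1.0f : 1.0f;   /* sign of a_i when A is conjugated */
    float sb = conj_b ? -1.0f : 1.0f;   /* sign of b_i when B is conjugated */
    /* real part: OP_rr (always fmla) and OP_ii (fmla or fmls) */
    *acc_r += a_r * b_r - (sa * a_i) * (sb * b_i);
    /* imaginary part: OP_ri and OP_ir (fmla or fmls) */
    *acc_i += a_r * (sb * b_i) + (sa * a_i) * b_r;
}

/* Mirrors SAVEv1x4/SAVEv1x2/SAVEv1x1: C += alpha * acc, computed with the
 * broadcast alphaz_R/alphaz_I vectors (z6/z7) as fmla/fmls/fmla/fmla. */
static void scale_into_c(float *c_r, float *c_i,
                         float acc_r, float acc_i,
                         float alpha_r, float alpha_i)
{
    *c_r += acc_r * alpha_r - acc_i * alpha_i;
    *c_i += acc_r * alpha_i + acc_i * alpha_r;
}

int main(void)
{
    float acc_r = 0.0f, acc_i = 0.0f;
    /* NN variant: (1+2i)*(3+4i) = -5+10i */
    cmla(&acc_r, &acc_i, 1.0f, 2.0f, 3.0f, 4.0f, 0, 0);
    printf("acc = %g%+gi\n", acc_r, acc_i);

    float c_r = 1.0f, c_i = 1.0f;
    /* alpha = i: C += i*acc, so C = (1-10) + (1-5)i = -9-4i */
    scale_into_c(&c_r, &c_i, acc_r, acc_i, 0.0f, 1.0f);
    printf("C   = %g%+gi\n", c_r, c_i);
    return 0;
}

For the NN case this reduces to acc_r += a_r*b_r - a_i*b_i and acc_i += a_r*b_i + a_i*b_r, i.e. the fmla/fmls/fmla/fmla selection of the first #if branch; the NR/RN/RR branches flip exactly the partial products whose factor is conjugated.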
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INITv1x4
+	dup z16.s, #0
+	dup z17.s, #0
+	dup z18.s, #0
+	dup z19.s, #0
+	dup z20.s, #0
+	dup z21.s, #0
+	dup z22.s, #0
+	dup z23.s, #0
+.endm
+
+// KERNELv1x4_I primes the software pipeline: it loads the current A pair
+// {z0, z1} and the next pair {z2, z3}; KERNELv1x4_M1/_M2 then alternate
+// between the two pairs, reloading the one just consumed.
+.macro KERNELv1x4_I
+	ld2w {z0.s, z1.s}, p1/z, [pA]
+	add pA, pA, lanes, lsl #3	// pA += lanes*2*4
+	ld2w {z2.s, z3.s}, p1/z, [pA]	// next one
+	add pA, pA, lanes, lsl #3	// pA += lanes*2*4
+
+	ld1rw z8.s, p0/z, [pB]
+	ld1rw z9.s, p0/z, [pB, 4]
+	ld1rw z10.s, p0/z, [pB, 8]
+	ld1rw z11.s, p0/z, [pB, 12]
+	ld1rw z12.s, p0/z, [pB, 16]
+	ld1rw z13.s, p0/z, [pB, 20]
+	ld1rw z14.s, p0/z, [pB, 24]
+	ld1rw z15.s, p0/z, [pB, 28]
+
+	add pB, pB, 32
+
+	fmla z16.s, p1/m, z0.s, z8.s
+	OP_ir z17.s, p1/m, z1.s, z8.s
+	ld1rw z8.s, p0/z, [pB]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+    defined(RR) || defined(RC) || defined(CR) || defined(CC)
+	#eor z17.16b, z17.16b, z17.16b
+	fmls z17.s, p1/m, z0.s, z9.s
+#else
+	fmla z17.s, p1/m, z0.s, z9.s
+#endif
+	OP_ii z16.s, p1/m, z1.s, z9.s
+	ld1rw z9.s, p0/z, [pB, 4]
+
+
+	fmla z18.s, p1/m, z0.s, z10.s
+	OP_ir z19.s, p1/m, z1.s, z10.s
+	ld1rw z10.s, p0/z, [pB, 8]
+	OP_ii z18.s, p1/m, z1.s, z11.s
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+    defined(RR) || defined(RC) || defined(CR) || defined(CC)
+	#eor z19.16b, z19.16b, z19.16b
+	fmls z19.s, p1/m, z0.s, z11.s
+#else
+	fmla z19.s, p1/m, z0.s, z11.s
+#endif
+	ld1rw z11.s, p0/z, [pB, 12]
+
+
+	fmla z20.s, p1/m, z0.s, z12.s
+	OP_ir z21.s, p1/m, z1.s, z12.s
+	ld1rw z12.s, p0/z, [pB, 16]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+    defined(RR) || defined(RC) || defined(CR) || defined(CC)
+	#eor z21.16b, z21.16b, z21.16b
+	fmls z21.s, p1/m, z0.s, z13.s
+#else
+	fmla z21.s, p1/m, z0.s, z13.s
+#endif
+	OP_ii z20.s, p1/m, z1.s, z13.s
+	ld1rw z13.s, p0/z, [pB, 20]
+
+
+	fmla z22.s, p1/m, z0.s, z14.s
+	OP_ir z23.s, p1/m, z1.s, z14.s
+	ld1rw z14.s, p0/z, [pB, 24]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+    defined(RR) || defined(RC) || defined(CR) || defined(CC)
+	#eor z23.16b, z23.16b, z23.16b
+	fmls z23.s, p1/m, z0.s, z15.s
+#else
+	fmla z23.s, p1/m, z0.s, z15.s
+#endif
+	OP_ii z22.s, p1/m, z1.s, z15.s
+	ld1rw z15.s, p0/z, [pB, 28]
+
+	add pB, pB, 32
+
+	prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+.endm
+
+.macro KERNELv1x4_M1
+	ld2w {z2.s, z3.s}, p1/z, [pA]
+	add pA, pA, lanes, lsl #3	// pA = pA + lanes * 2 * 4
+
+	OP_rr z16.s, p1/m, z0.s, z8.s
+	OP_ir z17.s, p1/m, z1.s, z8.s
+	ld1rw z8.s, p0/z, [pB]
+	OP_ii z16.s, p1/m, z1.s, z9.s
+	OP_ri z17.s, p1/m, z0.s, z9.s
+	ld1rw z9.s, p0/z, [pB, 4]
+
+	OP_rr z18.s, p1/m, z0.s, z10.s
+	OP_ir z19.s, p1/m, z1.s, z10.s
+	ld1rw z10.s, p0/z, [pB, 8]
+	OP_ii z18.s, p1/m, z1.s, z11.s
+	OP_ri z19.s, p1/m, z0.s, z11.s
+	ld1rw z11.s, p0/z, [pB, 12]
+
+	OP_rr z20.s, p1/m, z0.s, z12.s
+	OP_ir z21.s, p1/m, z1.s, z12.s
+	ld1rw z12.s, p0/z, [pB, 16]
+	OP_ii z20.s, p1/m, z1.s, z13.s
+	OP_ri z21.s, p1/m, z0.s, z13.s
+	ld1rw z13.s, p0/z, [pB, 20]
+
+	OP_rr z22.s, p1/m, z0.s, z14.s
+	OP_ir z23.s, p1/m, z1.s, z14.s
+	ld1rw z14.s, p0/z, [pB, 24]
+	OP_ii z22.s, p1/m, z1.s, z15.s
+	OP_ri z23.s, p1/m, z0.s, z15.s
+
ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2w {z28.s, z29.s}, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, 
p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + ld2w {z30.s, z31.s}, p1/z, [pCRow3] + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + + mov pB, 
origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lcgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lcgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lcgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lcgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lcgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lcgemm_kernel_L4_Mv1_22a + + .align 5 +.Lcgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L4_Mv1_22 + + .align 5 +.Lcgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + .align 5 +.Lcgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lcgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + +.Lcgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lcgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lcgemm_kernel_L4_Mv1_100 + + .align 5 +.Lcgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lcgemm_kernel_L4_Mv1_46 + +.Lcgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lcgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lcgemm_kernel_L4_Mv1_20 + + + +.Lcgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 4 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lcgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lcgemm_kernel_L999 + + tst counterJ , #2 + ble .Lcgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lcgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lcgemm_kernel_L2_Mv1_40 + .align 5 + +.Lcgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_22 + + +.Lcgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble 
.Lcgemm_kernel_L2_Mv1_100 + +.Lcgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_42 + +.Lcgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lcgemm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L2_Mv1_20 + + +.Lcgemm_kernel_L2_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 4 * 2 + +/******************************************************************************/ + +.Lcgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lcgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lcgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lcgemm_kernel_L1_Mv1_40 + .align 5 + +.Lcgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_22 + + +.Lcgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L1_Mv1_100 + +.Lcgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_42 + +.Lcgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lcgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L1_Mv1_20 + +.Lcgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lcgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..6aa44a8f6 --- /dev/null +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling of inner loop
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+  BLASLONG j;
+  IFLOAT *aoffset, *aoffset1, *boffset;
+
+  svint32_t lda_vec = svindex_s32(0, lda * 2);
+
+  aoffset = a;
+  boffset = b;
+
+  j = 0;
+  svbool_t pg = svwhilelt_b32(j, n);
+  uint32_t active = svcntp_b32(svptrue_b32(), pg);
+  do {
+
+    aoffset1 = aoffset;
+
+    uint32_t i_cnt = m;
+    while (i_cnt--) {
+      svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
+      svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec);
+      svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag));
+      aoffset1 += 2;
+      boffset += active * 2;
+    }
+    aoffset += active * lda * 2;
+
+    j += svcntw();
+    pg = svwhilelt_b32(j, n);
+    active = svcntp_b32(svptrue_b32(), pg);
+
+  } while (svptest_any(svptrue_b32(), pg));
+
+  return 0;
+}
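The gather-based packing above is easier to check against a scalar reference. The sketch below is illustrative only (the helper name and the explicit `active` parameter are not part of the patch); it produces the same packed layout, with `active` standing in for the SVE lane count reported by cntp:

    /* Scalar model of the SVE ncopy: pack blocks of `active` columns of the
       column-major complex matrix A into b, walking rows in the inner loop. */
    static void cgemm_ncopy_ref(BLASLONG m, BLASLONG n, float *a, BLASLONG lda,
                                float *b, BLASLONG active) {
      for (BLASLONG j = 0; j < n; j += active) {
        BLASLONG w = (n - j < active) ? (n - j) : active; /* predicated tail */
        for (BLASLONG i = 0; i < m; i++)
          for (BLASLONG l = 0; l < w; l++) {
            *b++ = a[(j + l) * lda * 2 + i * 2];     /* real part      */
            *b++ = a[(j + l) * lda * 2 + i * 2 + 1]; /* imaginary part */
          }
      }
    }

Each inner step corresponds to one gather/st2w pair in the intrinsics version: the two gathers pull the real and imaginary parts of row i across `w` consecutive columns, and st2w interleaves them back into packed complex pairs.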
diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c
new file mode 100644
index 000000000..748cd954e
--- /dev/null
+++ b/kernel/arm64/cgemm_tcopy_sve_v1.c
@@ -0,0 +1,75 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling of inner loop
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+  BLASLONG j;
+  IFLOAT *aoffset, *aoffset1, *boffset;
+
+  aoffset = a;
+  boffset = b;
+
+  j = 0;
+  svbool_t pg = svwhilelt_b32(j, n);
+  uint32_t active = svcntp_b32(svptrue_b32(), pg);
+  do {
+
+    aoffset1 = aoffset;
+
+    uint32_t i_cnt = m;
+    while (i_cnt--) {
+      svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1);
+      svst2_f32(pg, (float *) boffset, a_vec);
+      aoffset1 += lda * 2;
+      boffset += active * 2;
+    }
+    aoffset += active * 2;
+
+    j += svcntw();
+    pg = svwhilelt_b32(j, n);
+    active = svcntp_b32(svptrue_b32(), pg);
+
+  } while (svptest_any(svptrue_b32(), pg));
+
+  return 0;
+}
diff --git a/kernel/arm64/ctrmm_kernel_sve_v1x4.S b/kernel/arm64/ctrmm_kernel_sve_v1x4.S
new file mode 100644
index 000000000..242968f63
--- /dev/null
+++ b/kernel/arm64/ctrmm_kernel_sve_v1x4.S
@@ -0,0 +1,1006 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s6 +#define alpha0_I s7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, 
p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] 
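+    // KERNELv1x4_M1/M2 form the two halves of a software pipeline: M1
+    // computes on z0/z1 while ld2w fetches the next A block into z2/z3,
+    // M2 does the reverse, so the load latency hides behind the FMA chain.
+    // With a = a_r + i*a_i and b = b_r + i*b_i, the OP_* macros choose the
+    // fmla/fmls signs so that acc_r += a_r*b_r -/+ a_i*b_i and
+    // acc_i += a_r*b_i +/- a_i*b_r, covering the plain and conjugated variants.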
+.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, 
z1.s, z11.s
+    OP_ri   z19.s, p1/m, z0.s, z11.s
+
+    add     pB, pB, 16
+.endm
+
+.macro SAVEv1x2
+    prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+    eor     z24.d, z16.d, z16.d
+    eor     z25.d, z16.d, z16.d
+    fmla    z24.s, p1/m, z16.s, alphaz_R
+    fmls    z24.s, p1/m, z17.s, alphaz_I
+    fmla    z25.s, p1/m, z16.s, alphaz_I
+    fmla    z25.s, p1/m, z17.s, alphaz_R
+    st2w    {z24.s, z25.s}, p1, [pCRow0]
+
+    add     pCRow0, pCRow0, lanes, lsl #3
+
+    eor     z26.d, z16.d, z16.d
+    eor     z27.d, z16.d, z16.d
+    fmla    z26.s, p1/m, z18.s, alphaz_R
+    fmls    z26.s, p1/m, z19.s, alphaz_I
+    fmla    z27.s, p1/m, z18.s, alphaz_I
+    fmla    z27.s, p1/m, z19.s, alphaz_R
+    st2w    {z26.s, z27.s}, p1, [pCRow1]
+
+    add     pCRow1, pCRow1, lanes, lsl #3
+    prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+
+    prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+.endm
+
+/******************************************************************************/
+
+
+.macro INITv1x1
+    dup     z16.s, #0
+    dup     z17.s, #0
+.endm
+
+
+.macro KERNELv1x1_SUB
+    ld2w    {z0.s, z1.s}, p1/z, [pA]
+    add     pA, pA, lanes, lsl #3   // pA = pA + lanes * 2 * 4
+
+    ld1rw   z8.s, p0/z, [pB]
+    ld1rw   z9.s, p0/z, [pB, 4]
+
+    add     pB, pB, 8
+
+    OP_rr   z16.s, p1/m, z0.s, z8.s
+    OP_ir   z17.s, p1/m, z1.s, z8.s
+    OP_ii   z16.s, p1/m, z1.s, z9.s
+    OP_ri   z17.s, p1/m, z0.s, z9.s
+.endm
+
+.macro SAVEv1x1
+    prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+    eor     z24.d, z16.d, z16.d
+    eor     z25.d, z16.d, z16.d
+    fmla    z24.s, p1/m, z16.s, alphaz_R
+    fmls    z24.s, p1/m, z17.s, alphaz_I
+    fmla    z25.s, p1/m, z16.s, alphaz_I
+    fmla    z25.s, p1/m, z17.s, alphaz_R
+    st2w    {z24.s, z25.s}, p1, [pCRow0]
+
+    add     pCRow0, pCRow0, lanes, lsl #3   // pC = pC + lanes * 2 * 4
+
+    prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+
+.endm
+
+/******************************************************************************/
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+    PROLOGUE
+
+    .align 5
+    add     sp, sp, #-(11 * 16)
+    stp     d8, d9, [sp, #(0 * 16)]
+    stp     d10, d11, [sp, #(1 * 16)]
+    stp     d12, d13, [sp, #(2 * 16)]
+    stp     d14, d15, [sp, #(3 * 16)]
+    stp     d16, d17, [sp, #(4 * 16)]
+    stp     x18, x19, [sp, #(5 * 16)]
+    stp     x20, x21, [sp, #(6 * 16)]
+    stp     x22, x23, [sp, #(7 * 16)]
+    stp     x24, x25, [sp, #(8 * 16)]
+    stp     x26, x27, [sp, #(9 * 16)]
+    str     x28, [sp, #(10 * 16)]
+
+    prfm    PLDL1KEEP, [origPB]
+    prfm    PLDL1KEEP, [origPA]
+
+    fmov    alphaR, s0
+    dup     alphaz_R, alphaR
+    fmov    alphaI, s1
+    dup     alphaz_I, alphaI
+
+    lsl     LDC, LDC, #3    // ldc = ldc * 2 * 4
+    ptrue   p0.s            // create true predicate
+
+#if !defined(LEFT)
+    neg     tempOffset, offset
+#endif
+
+    mov     pB, origPB
+
+// Loop over N
+    mov     counterJ, origN
+    asr     counterJ, counterJ, #2  // J = J / 4
+    cmp     counterJ, #0
+    ble     .Lctrmm_kernel_L2_BEGIN
+
+/******************************************************************************/
+.Lctrmm_kernel_L4_BEGIN:
+    mov     pCRow0, pC
+    add     pCRow1, pCRow0, LDC
+    add     pCRow2, pCRow1, LDC
+    add     pCRow3, pCRow2, LDC
+
+    add     pC, pCRow3, LDC
+
+#if defined(LEFT)
+    mov     tempOffset, offset
+#endif
+    mov     pA, origPA  // pA = start of A array
+
+.Lctrmm_kernel_L4_Mv1_BEGIN:
+
+/* The loop over M is done in an SVE fashion, so the last M % SVE_LEN iterations are handled in a single predicated sweep. */
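+// Example: with 256-bit SVE (8 single-precision lanes) and M = 10, whilelt
+// yields an all-true predicate for the first sweep (lanes = 8) and a 2-lane
+// predicate for the second, so no scalar cleanup loop is needed for the tail.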
+    mov     counterI, #0
+    whilelt p1.s, counterI, origM
+    cntp    lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
+
+    .align 5
+.Lctrmm_kernel_L4_Mv1_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    mov     pB, origPB
+#else
+    mov     pB, origPB
+    mul     temp, tempOffset, lanes
+    add     pA, pA, temp, lsl #3    // add tempOffset*lanes*4*2
+    lsl     temp, tempOffset, #5
+    add     pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub     tempK, origK, tempOffset
+#elif defined(LEFT)
+    add     tempK, tempOffset, lanes
+#else
+    add     tempK, tempOffset, #4
+#endif
+    INITv1x4    // fill with zeros
+
+    asr     counterL, tempK, #3
+    cmp     counterL, #2
+    blt     .Lctrmm_kernel_L4_Mv1_32
+
+    KERNELv1x4_I
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+
+    subs    counterL, counterL, #2  // subtract 2
+    ble     .Lctrmm_kernel_L4_Mv1_22a
+
+    .align 5
+.Lctrmm_kernel_L4_Mv1_22:
+
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+
+    subs    counterL, counterL, #1
+    bgt     .Lctrmm_kernel_L4_Mv1_22
+
+    .align 5
+.Lctrmm_kernel_L4_Mv1_22a:
+
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_E
+
+    b       .Lctrmm_kernel_L4_Mv1_44
+
+    .align 5
+.Lctrmm_kernel_L4_Mv1_32:
+
+    tst     counterL, #1
+    ble     .Lctrmm_kernel_L4_Mv1_40
+
+    KERNELv1x4_I
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_M2
+    KERNELv1x4_M1
+    KERNELv1x4_E
+
+    b       .Lctrmm_kernel_L4_Mv1_44
+
+
+.Lctrmm_kernel_L4_Mv1_40:
+
+    INITv1x4
+
+.Lctrmm_kernel_L4_Mv1_44:
+
+    ands    counterL, tempK, #7
+    ble     .Lctrmm_kernel_L4_Mv1_100
+
+    .align 5
+.Lctrmm_kernel_L4_Mv1_46:
+    KERNELv1x4_SUB
+
+    subs    counterL, counterL, #1
+    bne     .Lctrmm_kernel_L4_Mv1_46
+
+.Lctrmm_kernel_L4_Mv1_100:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub     tempK, origK, tempOffset
+#if defined(LEFT)
+    sub     tempK, tempK, lanes
+#else
+    sub     tempK, tempK, #4
+#endif
+    mul     temp, tempK, lanes
+    add     pA, pA, temp, lsl #3    // add tempK*lanes*4*2
+    lsl     temp, tempK, #5
+    add     pB, pB, temp
+#endif
+#if defined(LEFT)
+    add     tempOffset, tempOffset, lanes
+#endif
+
+    prfm    PLDL1KEEP, [pA]
+    prfm    PLDL1KEEP, [pA, #64]
+    prfm    PLDL1KEEP, [origPB]
+
+    SAVEv1x4
+
+.Lctrmm_kernel_L4_Mv1_END:
+
+    incw    counterI
+    whilelt p1.s, counterI, origM   // SVE instruction
+    cntp    lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
+    b.any   .Lctrmm_kernel_L4_Mv1_20
+
+
+
+.Lctrmm_kernel_L4_END:
+
+    lsl     temp, origK, #5
+    add     origPB, origPB, temp    // B = B + K * 4 * 4 * 2
+
+#if !defined(LEFT)
+    add     tempOffset, tempOffset, #4
+#endif
+
+    subs    counterJ, counterJ, #1  // j--
+    bgt     .Lctrmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+.Lctrmm_kernel_L2_BEGIN:    // less than 4 left in N direction
+
+    mov     counterJ, origN
+    tst     counterJ, #3
+    ble     .Lctrmm_kernel_L999
+
+    tst     counterJ, #2
+    ble     .Lctrmm_kernel_L1_BEGIN
+
+    mov     pCRow0, pC  // pCRow0 = pC
+    add     pCRow1, pCRow0, LDC
+
+    add     pC, pC, LDC, lsl #1
+
+#if defined(LEFT)
+    mov     tempOffset, offset
+#endif
+
+    mov     pA, origPA  // pA = A
+
+
+
+.Lctrmm_kernel_L2_Mv1_BEGIN:
+
+    mov     counterI, #0
+    whilelt p1.s, counterI, origM   // SVE instruction
+    cntp    lanes, p0, p1.s
+
+
+.Lctrmm_kernel_L2_Mv1_20:
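+// The #if blocks below do the trmm offset bookkeeping for this tile:
+// tempOffset counts the K iterations that fall in the skipped triangular
+// part, pA/pB are advanced past that many packed panels (lanes*2 floats of
+// A and 2*2 floats of B per iteration), and tempK is the number of K
+// iterations that actually contribute to the v1x2 block.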
+    INITv1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    mov     pB, origPB
+#else
+    mov     pB, origPB
+    mul     temp, tempOffset, lanes
+    add     pA, pA, temp, lsl #3    // add tempOffset*lanes*4*2
+    lsl     temp, tempOffset, #4
+    add     pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub     tempK, origK, tempOffset
+#elif defined(LEFT)
+    add     tempK, tempOffset, lanes
+#else
+    add     tempK, tempOffset, #2
+#endif
+
+    asr     counterL, tempK, #3 // counterL = counterL / 8
+    cmp     counterL, #0
+    ble     .Lctrmm_kernel_L2_Mv1_40
+    .align 5
+
+.Lctrmm_kernel_L2_Mv1_22:
+    KERNELv1x2_SUB
+    KERNELv1x2_SUB
+    KERNELv1x2_SUB
+    KERNELv1x2_SUB
+
+    KERNELv1x2_SUB
+    KERNELv1x2_SUB
+    KERNELv1x2_SUB
+    KERNELv1x2_SUB
+
+    subs    counterL, counterL, #1
+    bgt     .Lctrmm_kernel_L2_Mv1_22
+
+
+.Lctrmm_kernel_L2_Mv1_40:
+
+    ands    counterL, tempK, #7 // counterL = counterL % 8
+    ble     .Lctrmm_kernel_L2_Mv1_100
+
+.Lctrmm_kernel_L2_Mv1_42:
+
+    KERNELv1x2_SUB
+
+    subs    counterL, counterL, #1
+    bgt     .Lctrmm_kernel_L2_Mv1_42
+
+.Lctrmm_kernel_L2_Mv1_100:
+
+    SAVEv1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub     tempK, origK, tempOffset
+#if defined(LEFT)
+    sub     tempK, tempK, lanes
+#else
+    sub     tempK, tempK, #2
+#endif
+    mul     temp, tempK, lanes
+    add     pA, pA, temp, lsl #3    // add tempK*lanes*4*2
+    lsl     temp, tempK, #4
+    add     pB, pB, temp
+#endif
+#if defined(LEFT)
+    add     tempOffset, tempOffset, lanes
+#endif
+
+.Lctrmm_kernel_L2_Mv1_END:
+
+
+    incw    counterI
+    whilelt p1.s, counterI, origM   // SVE instruction
+    cntp    lanes, p0, p1.s
+    b.any   .Lctrmm_kernel_L2_Mv1_20
+
+
+.Lctrmm_kernel_L2_END:
+#if !defined(LEFT)
+    add     tempOffset, tempOffset, #2
+#endif
+
+    lsl     temp, origK, #4
+    add     origPB, origPB, temp    // B = B + K * 2 * 4 * 2
+
+/******************************************************************************/
+
+.Lctrmm_kernel_L1_BEGIN:
+
+    mov     counterJ, origN
+    tst     counterJ, #1
+    ble     .Lctrmm_kernel_L999 // done
+
+
+    mov     pCRow0, pC  // pCRow0 = C
+    add     pC, pC, LDC // Update pC to point to next
+
+#if defined(LEFT)
+    mov     tempOffset, offset
+#endif
+
+    mov     pA, origPA  // pA = A
+
+.Lctrmm_kernel_L1_Mv1_BEGIN:
+
+    mov     counterI, #0
+    whilelt p1.s, counterI, origM   // SVE instruction
+    cntp    lanes, p0, p1.s
+
+
+.Lctrmm_kernel_L1_Mv1_20:
+
+    INITv1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    mov     pB, origPB
+#else
+    mov     pB, origPB
+    mul     temp, tempOffset, lanes
+    add     pA, pA, temp, lsl #3    // add tempOffset*lanes*4*2
+    lsl     temp, tempOffset, #3
+    add     pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub     tempK, origK, tempOffset
+#elif defined(LEFT)
+    add     tempK, tempOffset, lanes
+#else
+    add     tempK, tempOffset, #1
+#endif
+
+    asr     counterL, tempK, #3 // counterL = counterL / 8
+    cmp     counterL, #0
+    ble     .Lctrmm_kernel_L1_Mv1_40
+    .align 5
+
+.Lctrmm_kernel_L1_Mv1_22:
+    KERNELv1x1_SUB
+    KERNELv1x1_SUB
+    KERNELv1x1_SUB
+    KERNELv1x1_SUB
+
+    KERNELv1x1_SUB
+    KERNELv1x1_SUB
+    KERNELv1x1_SUB
+    KERNELv1x1_SUB
+
+    subs    counterL, counterL, #1
+    bgt     .Lctrmm_kernel_L1_Mv1_22
+
+
+.Lctrmm_kernel_L1_Mv1_40:
+
+    ands    counterL, tempK, #7 // counterL = counterL % 8
+    ble     .Lctrmm_kernel_L1_Mv1_100
+
+.Lctrmm_kernel_L1_Mv1_42:
+
+    KERNELv1x1_SUB
+
+    subs    counterL, counterL, #1
+    bgt     .Lctrmm_kernel_L1_Mv1_42
+
+.Lctrmm_kernel_L1_Mv1_100:
+
+    SAVEv1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub     tempK, origK, tempOffset
+#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L1_Mv1_20 + +.Lctrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c new file mode 100644 index 000000000..fa1c6e984 --- /dev/null +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -0,0 +1,320 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = sve_size; + if (i <= m) { + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - 
sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = sve_size; + if (i <= m) { + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * j * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c new file mode 100644 index 000000000..2cbb2aafb --- /dev/null +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -0,0 +1,295 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk 
= offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c new file mode 100644 index 000000000..5e4e8d9b1 --- /dev/null +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -0,0 +1,293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = sve_size; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if 
(n & j) { + + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c new file mode 100644 index 000000000..c376c0e33 --- /dev/null +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -0,0 +1,317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - j) * sve_size * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * 
COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c new file mode 100644 index 000000000..5a9d4194a --- /dev/null +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Packs one SVE-vector-wide panel of the lower-triangular matrix; diagonal elements are stored pre-inverted via INV() for the TRSM kernels. +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(ao + k * lda + j); + } + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + } + ao++; + b += n_active; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c new file mode 100644 index 000000000..ac4019e26 --- /dev/null +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Packs one SVE-vector-wide panel of the transposed lower-triangular matrix; diagonal elements are stored pre-inverted via INV(). +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(ao + j * lda + k); + } + } + b += n_active * n_active; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + } + ao += lda; + b += n_active; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c new file mode 100644 index 000000000..8fdcd0f4b --- /dev/null +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Packs one SVE-vector-wide panel of the upper-triangular matrix; diagonal elements are stored pre-inverted via INV(). +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(ao + k * lda + j); + } + } + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + } + ao++; + b += n_active; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c new file mode 100644 index 000000000..0f5f0dccd --- /dev/null +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Packs one SVE-vector-wide panel of the transposed upper-triangular matrix; diagonal elements are stored pre-inverted via INV(). +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(ao + j * lda + k); + } + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + } + ao += lda; + b += n_active; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..d5b35775c --- /dev/null +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 
pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, 
lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + ld2d {z26.d, z27.d}, p1/z, [pCRow1] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2d {z28.d, z29.d}, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + ld2d {z30.d, z31.d}, p1/z, [pCRow3] + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + 
fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + ld2d {z26.d, z27.d}, p1/z, [pCRow1] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lzgemm_kernel_L2_BEGIN + 
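+// The N loop is peeled into three passes: .Lzgemm_kernel_L4_* handles four
+// columns of B at a time, .Lzgemm_kernel_L2_* the remaining two and
+// .Lzgemm_kernel_L1_* the last odd column. Within each pass, M is swept one
+// whole SVE vector at a time under predicate p1, so the final M % SVE_LEN
+// rows take the same code path and need no separate scalar loop.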
+/******************************************************************************/ +.Lzgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lzgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lzgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lzgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lzgemm_kernel_L4_Mv1_22a + + .align 5 +.Lzgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_Mv1_22 + + .align 5 +.Lzgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lzgemm_kernel_L4_Mv1_44 + + .align 5 +.Lzgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lzgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lzgemm_kernel_L4_Mv1_44 + + +.Lzgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lzgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lzgemm_kernel_L4_Mv1_100 + + .align 5 +.Lzgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lzgemm_kernel_L4_Mv1_46 + +.Lzgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lzgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lzgemm_kernel_L4_Mv1_20 + + + +.Lzgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lzgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lzgemm_kernel_L999 + + tst counterJ , #2 + ble .Lzgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lzgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_Mv1_40 + .align 5 + +.Lzgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_Mv1_22 + + +.Lzgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_Mv1_100 + +.Lzgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_Mv1_42 + 
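+// K tail finished: z16-z19 hold the accumulated 1x2 complex block;
+// SAVEv1x2 below applies alpha and writes it back to the two C columns.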
+.Lzgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lzgemm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L2_Mv1_20 + + +.Lzgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lzgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lzgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lzgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lzgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_Mv1_40 + .align 5 + +.Lzgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_Mv1_22 + + +.Lzgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_Mv1_100 + +.Lzgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_Mv1_42 + +.Lzgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lzgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L1_Mv1_20 + +.Lzgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lzgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..8f9b4268a --- /dev/null +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include <arm_sve.h> + +// TODO: write in assembly with proper unrolling of inner loop +// Gathers real/imaginary parts across the lda stride and stores them interleaved (svst2), one SVE vector of columns per pass. +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda * 2); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); + svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active * 2; + } + aoffset += active * lda * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..c6e50bc1c --- /dev/null +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include <arm_sve.h> + +// TODO: write in assembly with proper unrolling of inner loop +// Rows are contiguous here: complex elements are loaded interleaved with svld2 and stored straight to the panel with svst2. +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); + svst2_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += active * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c new file mode 100644 index 000000000..37dbfe4e1 --- /dev/null +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -0,0 +1,172 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include <arm_sve.h> + +// Builds a full panel from the lower triangle of the Hermitian matrix: the gather index switches direction at the diagonal, imaginary parts are conjugated where needed and the diagonal imaginary part is forced to ZERO.
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + +#if defined(DOUBLE) + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active 
&& offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c new file mode 100644 index 000000000..21e03b7be --- /dev/null +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -0,0 +1,172 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include <arm_sve.h> + +// Builds a full panel from the upper triangle of the Hermitian matrix; imaginary parts are conjugated where needed and the diagonal imaginary part is forced to ZERO. +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + +#if defined(DOUBLE) + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); +#else + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += 
sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c new file mode 100644 index 000000000..6f18aa956 --- /dev/null +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c new file mode 100644 index 000000000..6be48cdaf --- /dev/null +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = 
svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..b71a3d39e --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, 
p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, 
#B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + 
OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lztrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lztrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lztrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
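(Roughly, in C terms the same loop structure is:

    uint64_t i = 0;
    svbool_t p1 = svwhilelt_b64(i, M);          /* all lanes active until the final sweep */
    while (svptest_any(svptrue_b64(), p1)) {
        /* ... compute one predicated vector of C rows under p1 ... */
        i += svcntd();                          /* advance by the full vector length */
        p1 = svwhilelt_b64(i, M);
    }

a sketch only, mirroring the whilelt/incd/b.any idiom that the code below keeps in counterI and lanes.)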
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lztrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lztrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lztrmm_kernel_L4_Mv1_22a + + .align 5 +.Lztrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L4_Mv1_22 + + .align 5 +.Lztrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + .align 5 +.Lztrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lztrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + +.Lztrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lztrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lztrmm_kernel_L4_Mv1_100 + + .align 5 +.Lztrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lztrmm_kernel_L4_Mv1_46 + +.Lztrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lztrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lztrmm_kernel_L4_Mv1_20 + + + +.Lztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lztrmm_kernel_L999 + + tst counterJ , #2 + ble .Lztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lztrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L2_Mv1_20: + + 
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lztrmm_kernel_L2_Mv1_40 + .align 5 + +.Lztrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_22 + + +.Lztrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L2_Mv1_100 + +.Lztrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_42 + +.Lztrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lztrmm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L2_Mv1_20 + + +.Lztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lztrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lztrmm_kernel_L1_Mv1_40 + .align 5 + +.Lztrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_22 + + +.Lztrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L1_Mv1_100 + +.Lztrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_42 + +.Lztrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lztrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L1_Mv1_20 + +.Lztrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c new file mode 100644 index 000000000..d34f607ab --- /dev/null +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X;
+
+  lda += lda;
+
+  js = 0;
+  FLOAT *ao;
+#ifdef DOUBLE
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do
+  {
+    X = posX;
+
+    if (posX <= posY) {
+      ao = a + posY * 2 + posX * lda;
+    } else {
+      ao = a + posX * 2 + posY * lda;
+    }
+
+    i = 0;
+    do
+    {
+      if (X > posY) {
+#ifdef DOUBLE
+        svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
+        svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#else
+        svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
+        svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#endif
+        svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
+        ao += 2;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else
+      if (X < posY) {
+        ao += lda;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else {
+        /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
+#ifdef UNIT
+        int temp = 0;
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0 ; k < j; k++) {
+            b[temp++] = *(ao+k*lda+j*2);
+            b[temp++] = *(ao+k*lda+j*2+1);
+          }
+          b[temp++] = ONE;
+          b[temp++] = ZERO;
+          for (int k = j+1; k < n_active; k++) {
+            b[temp++] = ZERO;
+            b[temp++] = ZERO;
+          }
+        }
+#else
+        int temp = 0;
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0 ; k <= j; k++) {
+            b[temp++] = *(ao+k*lda+j*2);
+            b[temp++] = *(ao+k*lda+j*2+1);
+          }
+          for (int k = j+1; k < n_active; k++) {
+            b[temp++] = ZERO;
+            b[temp++] = ZERO;
+          }
+        }
+#endif
+        ao += n_active * 2;
+        b += n_active*n_active * 2;
+        X += n_active;
+        i += n_active;
+      }
+    } while (i < m);
+
+    posY += n_active;
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, n);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+  return 0;
+}
diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c
new file mode 100644
index 000000000..7f34c9857
--- /dev/null
+++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c
@@ -0,0 +1,143 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X;
+
+  lda += lda;
+
+  FLOAT *ao;
+  js = 0;
+#ifdef DOUBLE
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do
+  {
+    X = posX;
+
+    if (posX <= posY) {
+      ao = a + posY * 2 + posX * lda;
+    } else {
+      ao = a + posX * 2 + posY * lda;
+    }
+
+    i = 0;
+    do
+    {
+      if (X > posY) {
+        ao += 2;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else
+      if (X < posY) {
+#ifdef DOUBLE
+        svfloat64x2_t aj_vec = svld2(pn, ao);
+#else
+        svfloat32x2_t aj_vec = svld2(pn, ao);
+#endif
+        svst2(pn, b, aj_vec);
+        ao += lda;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else {
+        /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
+#ifdef UNIT
+        int temp = 0;
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0 ; k < j; k++) {
+            b[temp++] = ZERO;
+            b[temp++] = ZERO;
+          }
+          b[temp++] = ONE;
+          b[temp++] = ZERO;
+          for (int k = j+1; k < n_active; k++) {
+            b[temp++] = *(ao+j*lda+k*2);
+            b[temp++] = *(ao+j*lda+k*2+1);
+          }
+        }
+#else
+        int temp = 0;
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0 ; k < j; k++) {
+            b[temp++] = ZERO;
+            b[temp++] = ZERO;
+          }
+          for (int k = j; k < n_active; k++) {
+            b[temp++] = *(ao+j*lda+k*2);
+            b[temp++] = *(ao+j*lda+k*2+1);
+          }
+        }
+#endif
+        ao += n_active * lda;
+        b += n_active*n_active * 2;
+        X += n_active;
+        i += n_active;
+      }
+    } while (i < m);
+
+
+    posY += n_active;
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, n);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+
+  return 0;
+}
diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c
new file mode 100644
index 000000000..7eb9452c9
--- /dev/null
+++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c
@@ -0,0 +1,145 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X;
+
+  lda += lda;
+
+  js = 0;
+  FLOAT *ao;
+#ifdef DOUBLE
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do
+  {
+    X = posX;
+
+    if (posX <= posY) {
+      ao = a + posX * 2 + posY * lda;
+    } else {
+      ao = a + posY * 2 + posX * lda;
+    }
+
+    i = 0;
+    do
+    {
+      if (X < posY) {
+#ifdef DOUBLE
+        svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
+        svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#else
+        svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
+        svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#endif
+        svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
+        ao += 2;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else
+      if (X > posY) {
+        ao += lda;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else {
+        /* I did not find a way to unroll this while preserving vector-length-agnostic code.
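+           Each row j of the n_active x n_active diagonal block switches between
+           generated values (ZERO padding, plus ONE on the diagonal for UNIT) and
+           copied matrix elements at a j-dependent position, so the store pattern
+           differs in every lane; the scalar j/k loops below handle it one complex
+           element at a time.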
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c new file mode 100644 index 000000000..60c8ff3b4 --- /dev/null +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X;
+
+  lda += lda;
+
+  FLOAT *ao;
+  js = 0;
+#ifdef DOUBLE
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do
+  {
+    X = posX;
+
+    if (posX <= posY) {
+      ao = a + posX * 2 + posY * lda;
+    } else {
+      ao = a + posY * 2 + posX * lda;
+    }
+
+    i = 0;
+    do
+    {
+      if (X < posY) {
+        ao += 2;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else
+      if (X > posY) {
+#ifdef DOUBLE
+        svfloat64x2_t aj_vec = svld2(pn, ao);
+#else
+        svfloat32x2_t aj_vec = svld2(pn, ao);
+#endif
+        svst2(pn, b, aj_vec);
+        ao += lda;
+        b += n_active * 2;
+        X ++;
+        i ++;
+      } else {
+        /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
+#ifdef UNIT
+        int temp = 0;
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0 ; k < j; k++) {
+            b[temp++] = *(ao+j*lda+k*2);
+            b[temp++] = *(ao+j*lda+k*2+1);
+          }
+          b[temp++] = ONE;
+          b[temp++] = ZERO;
+          for (int k = j+1; k < n_active; k++) {
+            b[temp++] = ZERO;
+            b[temp++] = ZERO;
+          }
+        }
+#else
+        int temp = 0;
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0 ; k <= j; k++) {
+            b[temp++] = *(ao+j*lda+k*2);
+            b[temp++] = *(ao+j*lda+k*2+1);
+          }
+          for (int k = j+1; k < n_active; k++) {
+            b[temp++] = ZERO;
+            b[temp++] = ZERO;
+          }
+        }
+#endif
+        ao += n_active * lda;
+        b += n_active*n_active * 2;
+        X += n_active;
+        i += n_active;
+      }
+    } while (i < m);
+
+    posY += n_active;
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, n);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+  return 0;
+}
diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c
new file mode 100644
index 000000000..eb7cd0294
--- /dev/null
+++ b/kernel/arm64/ztrsm_lncopy_sve.c
@@ -0,0 +1,119 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "arm_sve.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, jj;
+
+  FLOAT *ao;
+
+  lda *= 2;
+
+  jj = offset;
+#ifdef DOUBLE
+  int64_t js = 0;
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  int32_t N = n;
+  int32_t js = 0;
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, N);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do {
+
+    ao = a;
+
+    i = 0;
+    ii = 0;
+    do {
+
+      if (ii == jj) {
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0; k < j; k++) {
+            *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j);
+            *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1);
+          }
+          compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1));
+          //*(b + j * n_active + j) = INV(*(ao + j * lda + j));
+        }
+        ao += n_active * 2;
+        b += n_active * n_active * 2;
+        i += n_active;
+        ii += n_active;
+      } else {
+        if (ii > jj) {
+#ifdef DOUBLE
+          svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
+          svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#else
+          svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
+          svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#endif
+          svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
+        }
+        ao += 2;
+        b += n_active * 2;
+        i++;
+        ii++;
+      }
+    } while (i < m);
+
+
+    a += n_active * lda;
+    jj += n_active;
+
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, N);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+return 0;
+}
diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c
new file mode 100644
index 000000000..34dbf8a30
--- /dev/null
+++ b/kernel/arm64/ztrsm_ltcopy_sve.c
@@ -0,0 +1,115 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "arm_sve.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, jj;
+
+  FLOAT *ao;
+
+  lda *= 2;
+
+  jj = offset;
+#ifdef DOUBLE
+  int64_t js = 0;
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  int32_t N = n;
+  int32_t js = 0;
+  svbool_t pn = svwhilelt_b32(js, N);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do {
+
+    ao = a;
+
+    i = 0;
+    ii = 0;
+    do {
+
+      if (ii == jj) {
+        for (int j = 0; j < n_active; j++) {
+          compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1));
+          //*(b + j * n_active + j) = INV(*(ao + j * lda + j));
+          for (int k = j+1; k < n_active; k++) {
+            *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k);
+            *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1);
+          }
+        }
+        b += n_active * n_active * 2;
+        ao += lda * n_active;
+        i += n_active;
+        ii += n_active;
+      } else {
+        if (ii < jj) {
+#ifdef DOUBLE
+          svfloat64x2_t aj_vec = svld2(pn, ao);
+#else
+          svfloat32x2_t aj_vec = svld2(pn, ao);
+#endif
+          svst2(pn, b, aj_vec);
+        }
+        ao += lda;
+        b += n_active * 2;
+        i ++;
+        ii ++;
+      }
+    } while (i < m);
+
+
+    a += n_active * 2;
+    jj += n_active;
+
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, N);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+return 0;
+}
diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c
new file mode 100644
index 000000000..92e086b75
--- /dev/null
+++ b/kernel/arm64/ztrsm_uncopy_sve.c
@@ -0,0 +1,119 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "arm_sve.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, jj;
+
+  FLOAT *ao;
+
+  lda *= 2;
+
+  jj = offset;
+#ifdef DOUBLE
+  int64_t js = 0;
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  int32_t N = n;
+  int32_t js = 0;
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, N);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do {
+
+    ao = a;
+
+    i = 0;
+    ii = 0;
+    do {
+
+      if (ii == jj) {
+        for (int j = 0; j < n_active; j++) {
+          compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1));
+          //*(b + j * n_active + j) = INV(*(ao + j * lda + j));
+          for (int k = j+1; k < n_active; k++) {
+            *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j);
+            *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1);
+          }
+        }
+        ao += n_active * 2;
+        b += n_active * n_active * 2;
+        i += n_active;
+        ii += n_active;
+      } else {
+        if (ii < jj) {
+#ifdef DOUBLE
+          svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
+          svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#else
+          svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
+          svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#endif
+          svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
+        }
+        ao += 2;
+        b += n_active * 2;
+        i++;
+        ii++;
+      }
+    } while (i < m);
+
+
+    a += n_active * lda;
+    jj += n_active;
+
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, N);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+return 0;
+}
diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c
new file mode 100644
index 000000000..ccb942e1b
--- /dev/null
+++ b/kernel/arm64/ztrsm_utcopy_sve.c
@@ -0,0 +1,115 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/param.h b/param.h index e6bfb1d91..8649e4486 100644 --- a/param.h +++ b/param.h @@ -3395,11 +3395,13 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define DGEMM_DEFAULT_UNROLL_MN 32 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 -#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160
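
Note on the four TRSM copy kernels above: they all share one vector-length-agnostic loop structure. `svwhilelt` builds a predicate covering the columns that remain, `svcntp` counts the active lanes into `n_active`, the outer panel loop advances by that count, and `svptest_any` terminates once no lanes are left, so the same code is correct for any SVE vector length. On diagonal blocks, `compinv` (defined elsewhere in OpenBLAS, and replacing the scalar `INV()` visible in the commented-out line) stores the reciprocal of the complex diagonal element so the solver can later multiply instead of divide. The following minimal, self-contained sketch illustrates the complex reciprocal this presumably computes; `compinv_sketch` is a hypothetical stand-in for illustration, not the library routine:

#include <stdio.h>

/* Hypothetical equivalent of compinv (assumption, for illustration only):
 * store 1/(ar + ai*i) as an interleaved real/imag pair, the same layout
 * the diagonal-block packing above writes into the buffer b. */
static void compinv_sketch(double *dest, double ar, double ai) {
  double d = ar * ar + ai * ai;  /* squared magnitude |a|^2 */
  dest[0] =  ar / d;             /* Re(1/a) =  ar / |a|^2 */
  dest[1] = -ai / d;             /* Im(1/a) = -ai / |a|^2 */
}

int main(void) {
  double inv[2];
  compinv_sketch(inv, 3.0, 4.0); /* 1/(3+4i) = 0.12 - 0.16i */
  printf("Re = %f, Im = %f\n", inv[0], inv[1]);
  return 0;
}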