diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml new file mode 100644 index 000000000..ca53e8857 --- /dev/null +++ b/.github/workflows/dynamic_arch.yml @@ -0,0 +1,103 @@ +name: continuous build + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + fortran: [gfortran, flang] + build: [cmake, make] + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Compilation cache + uses: actions/cache@v2 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + key: ${{ runner.os }}-ccache-${{ github.sha }} + # Restore any ccache cache entry, if none for + # ${{ runner.os }}-ccache-${{ github.sha }} exists + restore-keys: | + ${{ runner.os }}-ccache- + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + elif [ "$RUNNER_OS" == "macOS" ]; then + sysctl -a | grep machdep.cpu + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get install -y gfortran cmake ccache + elif [ "$RUNNER_OS" == "macOS" ]; then + brew install coreutils cmake ccache + else + echo "$RUNNER_OS not supported" + exit 1 + fi + ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB + + - name: gfortran build + if: matrix.build == 'make' && matrix.fortran == 'gfortran' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 + + - name: flang build + if: matrix.build == 'make' && matrix.fortran == 'flang' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + exit 0 + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + cd /usr/ + sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz + sudo tar xf flang-20190329-x86-70.tgz + sudo rm flang-20190329-x86-70.tgz + cd - + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang + + + - name: CMake gfortran build + if: matrix.build == 'cmake' && matrix.fortran == 'gfortran' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + mkdir build + cd build + cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release .. 
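+          # DYNAMIC_ARCH=1 builds kernels for every supported CPU family and picks one at runtime, +          # NOFORTRAN=0 keeps the Fortran (and thus LAPACK) parts enabled, and ccache only +          # takes effect because the compilers are resolved through the PATH prefix exported above.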
+ make -j$(nproc) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index f55e73d23..8d7cfea2d 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -21,6 +21,7 @@ jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest env: + DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer HOMEBREW_DEVELOPER: "ON" HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" HOMEBREW_NO_ANALYTICS: "ON" diff --git a/.gitignore b/.gitignore index 6803a919e..bca79f043 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ test/SBLAT2.SUMM test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM +test/SHBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 @@ -79,6 +80,7 @@ test/dblat3 test/sblat1 test/sblat2 test/sblat3 +test/test_shgemm test/zblat1 test/zblat2 test/zblat3 diff --git a/.travis.yml b/.travis.yml index c875572b2..307010e40 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,6 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - - set -e - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE @@ -76,6 +75,23 @@ matrix: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" + - os: linux + arch: ppc64le + dist: bionic + compiler: gcc + before_script: + - sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y + - sudo apt-get update + - sudo apt-get install gcc-9 gfortran-9 -y + script: + - make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux compiler: gcc addons: @@ -108,7 +124,6 @@ matrix: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' before_script: *common-before script: - - set -e # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" @@ -151,7 +166,6 @@ matrix: before_script: - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" script: - - set -e - mkdir build - CONFIG=Release - cmake -Bbuild -H. 
$CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG diff --git a/CMakeLists.txt b/CMakeLists.txt index c2b9ae7ad..4b82d7670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -23,6 +23,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() @@ -86,10 +87,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () +if (NOT DEFINED BUILD_HALF) + set (BUILD_HALF false) +endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all - set(BUILD_HALF true) +# set(BUILD_HALF true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -121,7 +125,7 @@ if (BUILD_COMPLEX16) list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_SINGLE OR BUILD_HALF) +if (BUILD_HALF) message(STATUS "Building Half Precision") list(APPEND FLOAT_TYPES "HALF") # defines nothing endif () @@ -229,6 +233,7 @@ if (NOT MSVC AND NOT NOFORTRAN) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() + add_subdirectory(lapack-netlib/TESTING) endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES @@ -244,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) endif() endif() -if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") +if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() @@ -353,10 +358,21 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW 
"${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() @@ -373,11 +389,9 @@ if(NOT NO_LAPACKE) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() -include(FindPkgConfig QUIET) -if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) -endif() +# Install pkg-config files +configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) +install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index df497c1d2..aba39e56f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -180,3 +180,13 @@ In chronological order: * [2019-12-23] optimize AVX2 CGEMM and ZGEMM * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels * [2020-01-07] optimize AVX2 SGEMM and STRMM + +* Rajalakshmi Srinivasaraghavan + * [2020-04-15] Half-precision GEMM for bfloat16 + +* Marius Hillenbrand + * [2020-05-12] Revise dynamic architecture detection for IBM z + * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 + +* Danfeng Zhang + * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 \ No newline at end of file diff --git a/Changelog.txt b/Changelog.txt index 5f924629b..cbf0b50f5 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,77 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.10 + 14-Jun-2020 + +common: + * Improved thread locking behaviour in blas_server and parallel getrf + * Imported bugfix 394 from LAPACK (spurious reference to "XERBL" + due to overlong lines) + * Imported bugfix 403 from LAPACK (compile option "recursive" required + for correctness with Intel and PGI) + * Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB) + * Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP) + * Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that + could lead to crashes at large matrix sizes + * Restored internal soname in dynamic libraries on FreeBSD and Dragonfly + * Added API (openblas_setaffinity) to set the thread affinity on Linux + * Added initial infrastructure for half-precision floating point + (bfloat16) support with a generic implementation of SHGEMM + * Added CMAKE build system support for building the cblas_Xgemm3m + functions + * Fixed CMAKE support for building in a path with embedded spaces + * Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC + * Fixed GCC version detection in the Makefiles + * Allowed overriding the names of AR, AS and LD in Makefile builds + +POWER: + * Fixed big-endian POWER8 ELFv2 builds on FreeBSD + * Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9 + * Fixed CMAKE build support for POWER9 + * fixed a potential race condition in the thread buffer allocation + * Worked around LAPACK test failures on PPC G4 + +MIPS: + * Fixed a potential race condition in the thread 
buffer allocation + * Added support for MIPS 24K/24KE family based on P5600 kernels + +MIPS64: + * Fixed a potential race condition in the thread buffer allocation + * Added TARGET=GENERIC + +ARMV7: + * Fixed a race condition in the thread buffer allocation + +ARMV8: + * Fixed a race condition in the thread buffer allocation + * Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA + * Improved performance of the ThunderX2 DAXPY kernel + * Added an optimized SGEMM kernel for Cortex A53 + * Fixed Makefile support for INTERFACE64 (8-byte integer) + +x86_64: + * Fixed a syntax error in the CMAKE setup for SkylakeX + * Improved performance of STRSM on Haswell, SkylakeX and Ryzen + * Improved SGEMM performance for workloads with ldc a + multiple of 1024 + * Improved DGEMM performance on SkylakeX + * Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH + builds created on SkylakeX + * Removed data alignment requirement in the SSE2 copy kernels + that could cause spurious crashes + * Added a workaround for an optimizer bug in AppleClang 11.0.3 + * Fixed LAPACK test failures due to wrong options for Intel Fortran + * Fixed compilation and LAPACK test results with recent Flang + and AMD AOCC + * Fixed DYNAMIC_ARCH builds with CMAKE on OS X + * Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max, + cblas_?sum, cblas_?gemm3m in the shared library on OS X + * Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes + show the name of an older generation chip supported by the same kernels) + +IBM Z: + * Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14 + ==================================================================== Version 0.3.9 1-Mar-2020 diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 000000000..2b61bed9f --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,9 @@ +node { + stage('Checkout') { + checkout scm + } + + stage('Build') { + sh("make") + } +} diff --git a/Makefile b/Makefile index 18320e6a3..7a03b08f0 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ ifndef NO_FBLAS $(MAKE) -C test all endif $(MAKE) -C utest all -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) $(MAKE) -C ctest all ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all @@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif -ifndef NO_LAPACKE +ifneq ($(NO_LAPACKE), 1) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif endif @@ -264,6 +264,7 @@ lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -364,11 +365,12 @@ clean :: @$(MAKE) -C kernel clean #endif @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 ifeq ($(OSNAME), Darwin) @rm -rf getarch.dSYM 
getarch_2nd.dSYM endif @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f cblas.tmp cblas.tmp2 @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h diff --git a/Makefile.arm64 b/Makefile.arm64 index a7cd82e3a..1091edfe5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(CORE), THUNDERX3T110) +ifeq ($(GCCVERSIONGTEQ10), 1) +CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +else +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +endif +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.install b/Makefile.install index dad869f4c..7c1a3ca43 100644 --- a/Makefile.install +++ b/Makefile.install @@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig +PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(USE_OPENMP), 1) + ifeq ($(C_COMPILER), PGI) + PKG_EXTRALIB += -lomp + else + PKG_EXTRALIB += -lgomp + endif +endif .PHONY : install .NOTPARALLEL : install @@ -45,7 +53,22 @@ install : lib.grd ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" + @cp cblas.h cblas.tmp +ifdef SYMBOLPREFIX + @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp +endif +ifdef SYMBOLSUFFIX + @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp +endif + @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifneq ($(OSNAME), AIX) @@ -132,7 +155,7 @@ endif @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @@ -168,4 +191,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> 
"$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - diff --git a/Makefile.power b/Makefile.power index 24d8aa8a7..37a02d692 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,23 +9,63 @@ else USE_OPENMP = 1 endif +ifeq ($(CORE), POWER10) +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math +endif + ifeq ($(CORE), POWER9) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +CCOMMON_OPT += -mcpu=power8 -mtune=power8 else -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +CCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) +FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +FCOMMON_OPT += -mcpu=power8 -mtune=power8 +else +FCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else +FCOMMON_OPT += -O2 -Mrecursive endif endif ifeq ($(CORE), POWER8) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) +ifeq ($(OSNAME), AIX) +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +else +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif +else +FCOMMON_OPT += -O2 -Mrecursive +endif +endif + +ifeq ($(USE_OPENMP), 1) +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +CCOMMON_OPT += -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) +FCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +FCOMMON_OPT += -DUSE_OPENMP -mp endif endif @@ -68,6 +108,9 @@ CCOMMON_OPT += -mpowerpc64 -maix64 ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mpowerpc64 -maix64 endif +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mpowerpc64 -maix64 +endif ifeq ($(COMPILER_F77), xlf) FCOMMON_OPT += -q64 endif diff --git a/Makefile.rule b/Makefile.rule index 724a60ec4..2c12177ee 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.9.dev +VERSION = 0.3.10.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library @@ -273,6 +273,9 @@ COMMON_PROF = -pg # # CPP_THREAD_SAFETY_TEST = 1 + +# If you want to enable the experimental BFLOAT16 support +# BUILD_HALF = 1 # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index cc30b05a1..4eec20675 100644 --- a/Makefile.system +++ b/Makefile.system @@ -21,8 +21,14 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power +else ifeq ($(ARCH), powerpc) +override ARCH=power else ifeq ($(ARCH), i386) override ARCH=x86 +else ifeq ($(ARCH), armv6) +override ARCH=arm +else ifeq ($(ARCH), armv7) +override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 else ifeq ($(ARCH), zarch) @@ -86,6 +92,9 @@ endif ifeq ($(TARGET), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -107,6 +116,9 @@ endif ifeq ($(TARGET), ARMV8) GETARCH_FLAGS := -DFORCE_ARMV7 endif +ifeq ($(TARGET), POWER8) +GETARCH_FLAGS := -DFORCE_POWER6 +endif endif @@ -125,6 +137,9 @@ endif ifeq ($(TARGET_CORE), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -266,10 +281,10 @@ endif ARFLAGS = CPP = $(COMPILER) -E -AR = $(CROSS_SUFFIX)ar -AS = $(CROSS_SUFFIX)as -LD = $(CROSS_SUFFIX)ld -RANLIB = $(CROSS_SUFFIX)ranlib +AR ?= $(CROSS_SUFFIX)ar +AS ?= $(CROSS_SUFFIX)as +LD ?= $(CROSS_SUFFIX)ld +RANLIB ?= $(CROSS_SUFFIX)ranlib NM = $(CROSS_SUFFIX)nm DLLWRAP = $(CROSS_SUFFIX)dllwrap OBJCOPY = $(CROSS_SUFFIX)objcopy @@ -282,6 +297,26 @@ NO_LAPACK = 1 override FEXTRALIB = endif +ifeq ($(C_COMPILER), GCC) +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) +# Note that the behavior of -dumpversion is compile-time-configurable for +# gcc-7.x and newer. Use -dumpfullversion there +ifeq ($(GCCVERSIONGTEQ7),1) + GCCDUMPVERSION_PARAM := -dumpfullversion +else + GCCDUMPVERSION_PARAM := -dumpversion +endif +GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) +endif + # # OS dependent settings # @@ -328,13 +363,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -DMS_ABI endif -ifeq ($(C_COMPILER), GCC) #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Major version > 4 # It is compatible with MSVC ABI. 
@@ -348,7 +377,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) CCOMMON_OPT += -DMS_ABI endif endif -endif # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 @@ -540,7 +568,7 @@ DYNAMIC_CORE += HASWELL ZEN endif ifneq ($(NO_AVX512), 1) ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX +DYNAMIC_CORE += SKYLAKEX COOPERLAKE endif endif endif @@ -565,11 +593,38 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 +DYNAMIC_CORE += THUNDERX3T110 endif ifeq ($(ARCH), zarch) -DYNAMIC_CORE = Z13 +DYNAMIC_CORE = ZARCH_GENERIC + +# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer +ifeq ($(GCCVERSIONGT5), 1) + ZARCH_SUPPORT_Z13 := 1 +else ifeq ($(GCCVERSIONEQ5), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) +ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(ZARCH_SUPPORT_Z13), 1) +DYNAMIC_CORE += Z13 +else +$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +endif + +ifeq ($(GCCVERSIONGTEQ7), 1) DYNAMIC_CORE += Z14 +else +$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) +endif endif ifeq ($(ARCH), power) @@ -577,14 +632,23 @@ DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 +DYNAMIC_CORE += POWER10 endif ifeq ($(C_COMPILER), GCC) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) ifeq ($(GCCVERSIONGT5), 1) DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif +ifeq ($(GCCVERSIONGTEQ11), 1) +DYNAMIC_CORE += POWER10 +else ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) +DYNAMIC_CORE += POWER10 +endif +else +$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) 
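+# The version tests above gate the POWER10 kernels on gcc 10.2 or newer (or any gcc 11), as older releases cannot compile them.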
+endif endif endif @@ -745,8 +809,19 @@ endif ifeq ($(C_COMPILER), PGI) ifdef BINARY64 +ifeq ($(ARCH), x86_64) CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +CCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +CCOMMON_OPT += -tp pwr9 +endif +endif +endif +else CCOMMON_OPT += -tp p7 endif endif @@ -765,6 +840,15 @@ endif ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG +FCOMMON_OPT += -Mrecursive -Kieee +ifeq ($(OSNAME), Linux) +ifeq ($(ARCH), x86_64) +FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +ifeq ($(FLANG_VENDOR),AOCC) +FCOMMON_OPT += -fno-unroll-loops +endif +endif +endif ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -860,7 +944,7 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif -FCOMMON_OPT += -recursive +FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif @@ -900,8 +984,19 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif +ifeq ($(ARCH), x86_64) FCOMMON_OPT += -tp p7-64 else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +FCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +FCOMMON_OPT += -tp pwr9 +endif +endif +endif +else FCOMMON_OPT += -tp p7 endif FCOMMON_OPT += -Mrecursive @@ -1129,6 +1224,10 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif +ifeq ($(BUILD_HALF), 1) +CCOMMON_OPT += -DBUILD_HALF +endif + CCOMMON_OPT += -DVERSION=\"$(VERSION)\" ifndef SYMBOLPREFIX @@ -1155,6 +1254,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) @@ -1247,7 +1349,6 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) - override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = @@ -1354,6 +1455,7 @@ export ARCH export CORE export LIBCORE export __BYTE_ORDER__ +export ELF_VERSION export PGCPATH export CONFIG export CC @@ -1399,6 +1501,7 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export BUILD_HALF export SHGEMM_UNROLL_M export SHGEMM_UNROLL_N diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f2de51ef4..00975b25a 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -27,18 +27,54 @@ endif endif endif +ifeq ($(CORE), COOPERLAKE) +ifndef DYNAMIC_ARCH +ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# cooperlake support was added in 10.1 +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) +GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) +ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) +CCOMMON_OPT += -march=cooperlake +FCOMMON_OPT += -march=cooperlake +endif +endif +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif +endif +endif +endif + ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion 
| cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) CCOMMON_OPT += -mavx2 endif +endif ifeq ($(F_COMPILER), GFORTRAN) +# AVX2 support was added in 4.7.0 +GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7) +ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) FCOMMON_OPT += -mavx2 endif endif endif endif +endif diff --git a/Makefile.zarch b/Makefile.zarch index 47ea1eb71..be1e34f6d 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector endif ifeq ($(CORE), Z14) -CCOMMON_OPT += -march=z14 -mzvector +CCOMMON_OPT += -march=z14 -mzvector -O3 FCOMMON_OPT += -march=z14 -mzvector endif diff --git a/README.md b/README.md index 6dc3c7b42..f8226f5cb 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code -using Git from https://github.com/xianyi/OpenBLAS.git. +using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up-to-date version, be +sure to use the develop branch - master is several years out of date due to a change of maintainership.) Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. Most can also be given directly on the make or cmake command line. @@ -58,6 +59,10 @@ Examples: ```sh make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A ``` + or the same with the newer MIPS cross-compiler provided by Loongson, which defaults to the 32-bit ABI: + ```sh + make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A + ``` * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: ```sh diff --git a/TargetList.txt b/TargetList.txt index e2d2f4026..5934f3012 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -22,6 +22,7 @@ SANDYBRIDGE HASWELL SKYLAKEX ATOM +COOPERLAKE b)AMD CPU: ATHLON @@ -49,6 +50,7 @@ POWER6 POWER7 POWER8 POWER9 +POWER10 PPCG4 PPC970 PPC970MP @@ -95,6 +97,7 @@ FALKOR THUNDERX THUNDERX2T99 TSV110 +THUNDERX3T110 9.System Z: ZARCH_GENERIC diff --git a/benchmark/Makefile b/benchmark/Makefile index 90d903ad7..2f70ceaf3 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,6 +49,12 @@ else GOTO_LAPACK_TARGETS= endif +ifeq ($(BUILD_HALF),1) +GOTO_HALF_TARGETS=shgemm.goto +else +GOTO_HALF_TARGETS= +endif + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto 
daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### +ifeq ($(BUILD_HALF),1) +shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1814,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX) ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1830,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX) ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1847,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX) ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1864,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX) ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1880,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1896,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX) ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1913,7 +1924,7 @@ dgetri.veclib : 
dgetri.$(SUFFIX) ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1930,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX) ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ifeq ($(BUILD_HALF),1) +shgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + sgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/gemm.c b/benchmark/gemm.c index dd016a7c3..d2235330b 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE #define GEMM BLASFUNC(dgemm) +#elif defined(HALF) +#define GEMM BLASFUNC(shgemm) #else #define GEMM BLASFUNC(sgemm) #endif @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ int main(int argc, char *argv[]){ - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; char transa = 'N'; @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ k = to; } - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { + if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { + if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ #endif for (i = 0; i < m * k * COMPSIZE; i++) { - a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < k * n * COMPSIZE; i++) { - b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index ed9d4d2e8..136135c9c 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -170,9 +170,11 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); - +#ifdef RETURN_BY_STACK + DOT (&result , &m, x, &inc_x, y, &inc_y ); +#else result = DOT (&m, x, &inc_x, y, &inc_y ); - +#endif gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; diff --git a/c_check b/c_check index c7899c84f..314c2b157 100644 --- a/c_check +++ b/c_check @@ -6,6 +6,7 @@ # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e 
s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -p` if ($hostos eq "AIX"); $hostarch = "x86_64" if ($hostarch eq "amd64"); $hostarch = "arm" if ($hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); @@ -248,6 +249,28 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } } +$c11_atomics = 0; +if ($data =~ /HAVE_C11/) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11"; + $c11_atomics = 0; + } else { + ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); + print $tmpf "#include <stdatomic.h>\nint main(void){}\n"; + $args = " -c -o $tmpf.o $tmpf"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $c11_atomics = 0; + } else { + $c11_atomics = 1; + } + unlink("$tmpf.o"); + } +} + + $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -310,6 +333,7 @@ $linker_a = ""; && ($flags !~ /advapi32/) && ($flags !~ /shell32/) && ($flags !~ /omp/) + && ($flags !~ /[0-9]+/) ) { $linker_l .= $flags . " " } @@ -350,6 +374,8 @@ print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; +print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; + if ($os eq "LINUX") { diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 9d51f777c..c00f8fe71 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,11 +45,11 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) endif () if (POWER) - set(DYNAMIC_CORE POWER6 POWER8 POWER9) + set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) endif () if (X86) @@ -76,9 +76,9 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () if (NOT NO_AVX512) - set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - endif () + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index d5551147c..c490dd9ab 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -103,3 +103,16 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () endif () + +if (${CORE} STREQUAL "COOPERLAKE") + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + endif() + endif () + endif () +endif () diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index f877fc3e1..0f5d0e15d 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -21,7 +21,15 @@ # NEED2UNDERSCORES if (NOT NO_LAPACK) - enable_language(Fortran) + include(CheckLanguage) + check_language(Fortran) + if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) + else() + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + set (NOFORTRAN 1) + set (NO_LAPACK 
1) + endif() else() include(CMakeForceCompiler) CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index cc330ae2c..fc1f9bb22 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") endif () if (${F_COMPILER} STREQUAL "G77") diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 1c1fed571..4b505a102 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,6 +113,7 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) +if (BUILD_HALF) set(SHAMINKERNEL ../arm/amin.c) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) @@ -131,6 +132,7 @@ macro(SetDefaultL1) set(SHNRM2KERNEL ../arm/nrm2.c) set(SHSUMKERNEL ../arm/sum.c) set(SHSWAPKERNEL ../arm/swap.c) +endif () endmacro () macro(SetDefaultL2) @@ -179,10 +181,11 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) +if (BUILD_HALF) set(SHGEMVNKERNEL ../arm/gemv_n.c) set(SHGEMVTKERNEL ../arm/gemv_t.c) set(SHGERKERNEL ../generic/ger.c) - +endif () endmacro () macro(SetDefaultL3) @@ -190,6 +193,7 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) +if (BUILD_HALF) set(SHGEADD_KERNEL ../generic/geadd.c) set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) set(SHGEMM_BETA ../generic/gemm_beta.c) @@ -201,6 +205,6 @@ macro(SetDefaultL3) set(SHGEMMITCOPYOBJ shgemm_itcopy.o) set(SHGEMMONCOPYOBJ shgemm_oncopy.o) set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) - +endif () endmacro () diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index df4b2ab06..0bd49f996 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -7,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas${libsuffix} +Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} diff --git a/cmake/os.cmake b/cmake/os.cmake index 2d25e7aaa..c644bc3f7 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () -if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") +if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin") set(EXTRALIB "${EXTRALIB} -lm") set(NO_EXPRECISION 1) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index e0696093b..3b2a9d6a2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_VFP\n" "#define HAVE_NEON\n" "#define ARMV8\n") +if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) +else () + set(SGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) +endif () set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) @@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX3T110") + file(APPEND ${TARGET_CONF_TEMP} + "#define THUNDERX3T110\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + 
"#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t94371840\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "TSV110") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" @@ -420,7 +452,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) - elseif ("${TCORE}" STREQUAL "POWER9") + elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" "#define L1_DATA_LINESIZE 128\n" @@ -492,7 +524,7 @@ else(NOT CMAKE_CROSSCOMPILING) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -520,7 +552,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) diff --git a/cmake/system.cmake b/cmake/system.cmake index 65e5aa508..e3617c4e2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") @@ -45,6 +45,18 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () if (DEFINED TARGET) + if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") +# endif() + endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() @@ -297,6 +309,16 @@ if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") +if (DEFINED MAX_STACK_ALLOC) +if (NOT ${MAX_STACK_ALLOC} EQUAL 0) +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}") +endif () +else () +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") +endif () +endif () + if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () @@ -407,6 +429,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") +if ("${F_COMPILER}" STREQUAL "FLANG") +if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") +endif () +endif () +endif () + if (NOT DEFINED SUFFIX) set(SUFFIX o) endif () diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94eb0a9c6..4382ffc4e 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -116,3 +116,10 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() file(REMOVE "avx512.c" "avx512.o") endif() + +include(CheckIncludeFile) +CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) +if (HAVE_C11 EQUAL 1) +message (STATUS found stdatomic.h) +set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") +endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 831ddffe6..1c21e776e 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,12 +15,36 @@ endfunction () # Reads a Makefile into CMake vars. macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") + set (IfElse 0) + set (ElseSeen 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) +#message(STATUS "parsing ${makefile_line}") + if (${IfElse} GREATER 0) + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + continue () + endif () + string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + continue () + endif() + if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) +# message(STATUS "skipping ${makefile_line}") + continue () + endif () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) - set(var_value ${CMAKE_MATCH_2}) +# set(var_value ${CMAKE_MATCH_2}) + string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the string, e.g. 
$(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) foreach (make_var ${make_var_matches}) @@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN) else () string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on include ${line_match}") ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + else () +# message(STATUS "unmatched line ${line_match}") + string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) +# message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + else () + string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) +# message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + endif () + endif () endif () endif () endforeach () diff --git a/common.h b/common.h index e2c8cdee5..d6637abe4 100644 --- a/common.h +++ b/common.h @@ -360,13 +360,8 @@ typedef int blasint; #endif #endif -#ifdef POWER8 -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif -#ifdef POWER9 +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif @@ -686,7 +681,7 @@ __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #if defined(C_GCC) && ( __GNUC__ < 7) // workaround for GCC bug 65467 #ifndef _Atomic diff --git a/common_level3.h b/common_level3.h index 6d5cfa65d..88a521b73 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,12 +47,12 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif -extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, +void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, float * A, BLASLONG strideA, float * B, BLASLONG strideB, float * R, BLASLONG strideR); -extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); +int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, diff --git a/common_mips.h b/common_mips.h index dd2f8d558..7dc3ba246 100644 --- a/common_mips.h +++ b/common_mips.h @@ -94,7 +94,7 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) -#define BUFFER_SIZE (16 << 20) +#define BUFFER_SIZE (16 << 21) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_mips64.h b/common_mips64.h index af638d60c..a06edfe08 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -227,7 +227,7 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 32 << 20) +#define BUFFER_SIZE ( 32 << 21) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) diff --git a/common_param.h b/common_param.h index 19a34fa3d..0437482dc 100644 --- a/common_param.h +++ b/common_param.h @@ -47,7 +47,7 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; 
-#if 1 +#ifdef BUILD_HALF int shgemm_p, shgemm_q, shgemm_r; int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; @@ -175,6 +175,11 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#ifdef ARCH_X86_64 + void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); + int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); +#endif + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -1002,12 +1007,14 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#ifdef BUILD_HALF #define SHGEMM_P gotoblas -> shgemm_p #define SHGEMM_Q gotoblas -> shgemm_q #define SHGEMM_R gotoblas -> shgemm_r #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#endif #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q @@ -1086,6 +1093,7 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif +#ifdef BUILD_HALF #define SHGEMM_P SHGEMM_DEFAULT_P #define SHGEMM_Q SHGEMM_DEFAULT_Q #define SHGEMM_R SHGEMM_DEFAULT_R @@ -1096,6 +1104,7 @@ extern gotoblas_t *gotoblas; #else #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) #endif +#endif #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q @@ -1330,31 +1339,31 @@ extern gotoblas_t *gotoblas; #endif #ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A 
+ GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) #endif #ifndef SNUMOPT diff --git a/common_power.h b/common_power.h index e29d0f382..e0685f760 100644 --- a/common_power.h +++ b/common_power.h @@ -68,7 +68,7 @@ #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #define RMB __asm__ __volatile__ ("eieio":::"memory") @@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ " bne- 1f\n" " stwcx. %2,0, %1\n" " bne- 0b\n" + " isync\n" "1: " : "=&r"(ret) : "r"(address), "r" (val) @@ -272,7 +273,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -294,7 +295,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -843,7 +844,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) || defined(POWER9) +#elif defined(POWER8) || defined(POWER9) || defined(POWER10) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/common_s.h b/common_s.h index 270d55e9a..685d73062 100644 --- a/common_s.h +++ b/common_s.h @@ -45,6 +45,10 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct + #define SGEMM_ONCOPY sgemm_oncopy #define SGEMM_OTCOPY sgemm_otcopy @@ -214,6 +218,14 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L +#ifdef ARCH_X86_64 +#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant +#define SGEMM_DIRECT gotoblas -> sgemm_direct +#else +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct +#endif + #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy #define SGEMM_OTCOPY gotoblas -> 
sgemm_otcopy
#define SGEMM_INCOPY gotoblas -> sgemm_incopy
diff --git a/common_thread.h b/common_thread.h
index 6ec40e096..ec0c65b22 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -132,18 +132,18 @@ extern int blas_server_avail;
 static __inline int num_cpu_avail(int level) {
 #ifdef USE_OPENMP
-        int openmp_nthreads=0;
+        int openmp_nthreads=omp_get_max_threads();
 #endif
+#ifndef USE_OPENMP
   if (blas_cpu_number == 1
-
-#ifdef USE_OPENMP
-      || omp_in_parallel()
 #endif
-      ) return 1;
+#ifdef USE_OPENMP
+  if (openmp_nthreads == 1 || omp_in_parallel()
+#endif
+      ) return 1;
 #ifdef USE_OPENMP
-  openmp_nthreads=omp_get_max_threads();
   if (blas_cpu_number != openmp_nthreads) {
           goto_set_num_threads(openmp_nthreads);
   }
diff --git a/common_x86_64.h b/common_x86_64.h
index 0247674cd..bee7e8cdb 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
 #endif
   do {
-    while (*address) {YIELDING;};
+    while (*address) {YIELDING;}
 #ifndef C_MSVC
     __asm__ __volatile__(
@@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){
 #else
 extern unsigned int blas_quick_divide_table[];
-static __inline int blas_quickdivide(unsigned int x, unsigned int y){
+static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
-  unsigned int result;
+  volatile unsigned int result;
   if (y <= 1) return x;
@@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
   y = blas_quick_divide_table[y];
   __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
-
   return result;
 }
 #endif
@@ -229,14 +228,8 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #define HUGE_PAGESIZE ( 2 << 20)
 #ifndef BUFFERSIZE
-#if defined(SKYLAKEX)
-#define BUFFER_SIZE (32 << 21)
-#elif defined(HASWELL) || defined(ZEN)
 #define BUFFER_SIZE (32 << 22)
 #else
-#define BUFFER_SIZE (32 << 20)
-#endif
-#else
 #define BUFFER_SIZE (32 << BUFFERSIZE)
 #endif
diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h
index 60ab5bb2f..8005369a8 100644
--- a/cpp_thread_test/cpp_thread_safety_common.h
+++ b/cpp_thread_test/cpp_thread_safety_common.h
@@ -5,6 +5,14 @@ inline void pauser(){
 	std::getline(std::cin, dummy);
 }
+
+void FailIfThreadsAreZero(uint32_t numConcurrentThreads) {
+	if(numConcurrentThreads == 0) {
+		std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<<std::endl;
+		exit(-1);
+	}
+}
+
 inline void InitializeMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
 	for(uint32_t i=0; i<(numConcurrentThreads*numMat); i++){
 		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp
index 1c5287524..104c64f2a 100644
--- a/cpp_thread_test/dgemm_thread_safety.cpp
+++ b/cpp_thread_test/dgemm_thread_safety.cpp
@@ -46,6 +46,8 @@ int main(int argc, char* argv[]){
 	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
 	std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+	FailIfThreadsAreZero(numConcurrentThreads);
+
diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp
--- a/cpp_thread_test/dgemv_thread_safety.cpp
+++ b/cpp_thread_test/dgemv_thread_safety.cpp
 	if(argc > 4){
 		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
 		abort();
 	}
 	std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+	FailIfThreadsAreZero(numConcurrentThreads);
diff --git a/cpuid_power.c b/cpuid_power.c
--- a/cpuid_power.c
+++ b/cpuid_power.c
 #include <stdio.h>
 #ifdef _AIX
+#include <sys/systemcfg.h>
 #include
 #endif
 #ifdef __APPLE__
@@ -57,6 +58,7 @@
 #define CPUTYPE_PPCG4 7
 #define CPUTYPE_POWER8 8
 #define CPUTYPE_POWER9 9
+#define CPUTYPE_POWER10 10
 char *cpuname[] = {
   "UNKNOWN",
@@
-68,7 +70,8 @@ char *cpuname[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; char *lowercpuname[] = { @@ -81,7 +84,8 @@ char *lowercpuname[] = { "cell", "ppcg4", "power8", - "power9" + "power9", + "power10" }; char *corename[] = { @@ -94,7 +98,8 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; int detect(void){ @@ -125,6 +130,7 @@ int detect(void){ if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; + if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -132,34 +138,19 @@ int detect(void){ #endif #ifdef _AIX - FILE *infile; - char buffer[512], *p; + // Cast from int to unsigned to ensure comparisons work for all bits in + // the bit mask, even the top bit + unsigned implementation = (unsigned) _system_configuration.implementation; - p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Pro", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - pclose(infile); - - if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; - if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; - if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; - if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; - if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; - if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; - if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; - return CPUTYPE_POWER5; + if (implementation >= 0x40000u) return CPUTYPE_POWER10; + else if (implementation & 0x20000) return CPUTYPE_POWER9; + else if (implementation & 0x10000) return CPUTYPE_POWER8; + else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7 + else if (implementation & 0x04000) return CPUTYPE_POWER6; + else if (implementation & 0x02000) return CPUTYPE_POWER5; + else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 + else if (implementation & 0x00800) return CPUTYPE_POWER4; + else return CPUTYPE_POWER3; #endif #ifdef __APPLE__ @@ -179,6 +170,9 @@ int detect(void){ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { + case 0x80: // POWER10 + return CPUTYPE_POWER10; + break; case 0x4e: // POWER9 return CPUTYPE_POWER9; break; diff --git a/cpuid_x86.c b/cpuid_x86.c index e29adecae..728d459d1 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -249,6 +249,22 @@ int support_avx512(){ #endif } +int support_avx512_bf16(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + cpuid_count(7, 1, &eax, &ebx, &ecx, &edx); + if((eax & 32) == 32){ + ret=1; // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not + } + return ret; +#else + return 0; +#endif +} int get_vendor(void){ int eax, ebx, ecx, edx; @@ -335,6 +351,7 @@ int get_cputype(int gettype){ if (support_avx()) feature |= HAVE_AVX; if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; + if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1337,6 
+1354,8 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 5: // Skylake X + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; if(support_avx512()) return CPUTYPE_SKYLAKEX; if(support_avx2()) @@ -1406,6 +1425,17 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + } + case 10: //family 6 exmodel 10 + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -1443,10 +1473,11 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 3: - case 7: - case 10: +// case 7: +// case 10: return CPUTYPE_BARCELONA; case 5: + case 7: return CPUTYPE_BOBCAT; case 6: switch (model) { @@ -1496,6 +1527,8 @@ int get_cpuname(void){ // AMD Ryzen case 8: // AMD Ryzen2 + default: + // Matisse/Renoir and other recent Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; @@ -1505,6 +1538,16 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } + break; + case 10: // Zen3 + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; } break; } @@ -1653,7 +1696,8 @@ static char *cpuname[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *lowercpuname[] = { @@ -1709,7 +1753,8 @@ static char *lowercpuname[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; static char *corename[] = { @@ -1742,7 +1787,8 @@ static char *corename[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *corename_lower[] = { @@ -1775,7 +1821,8 @@ static char *corename_lower[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; @@ -1955,6 +2002,19 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 10: + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + } case 5: switch (model) { case 6: @@ -1970,7 +2030,9 @@ int get_coretype(void){ case 5: // Skylake X #ifndef NO_AVX512 - return CORE_SKYLAKEX; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + return CORE_SKYLAKEX; #else if(support_avx()) #ifndef NO_AVX2 @@ -2083,7 +2145,7 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. 
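A note on the detection logic above: CPUID leaf 7 carries sub-leaves, and the AVX512_BF16 capability that distinguishes Cooper Lake from Skylake-X is reported in sub-leaf 1, EAX bit 5, so a plain leaf-7 query is not enough. Below is a minimal standalone probe of the same bits, assuming GCC/Clang's <cpuid.h>; it is an illustrative sketch only, and unlike the library's support_avx512_bf16() it skips the xgetbv check that verifies the OS actually saves AVX512 register state.

    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        /* CPUID leaf 7, sub-leaf 0: EBX bit 16 = AVX512F, the baseline for all AVX512 extensions */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) || !(ebx & (1u << 16))) {
            printf("no AVX512F\n");
            return 0;
        }
        /* CPUID leaf 7, sub-leaf 1: EAX bit 5 = AVX512_BF16, the bit tested by support_avx512_bf16() */
        __cpuid_count(7, 1, eax, ebx, ecx, edx);
        printf((eax & (1u << 5)) ? "AVX512_BF16 available\n" : "no AVX512_BF16\n");
        return 0;
    }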
- case 5: // New EXCAVATOR + case 5: // New EXCAVATOR if(support_avx()) return CORE_EXCAVATOR; else @@ -2111,12 +2173,14 @@ int get_coretype(void){ } break; } - } else if (exfamily == 8) { + } else if (exfamily == 8 || exfamily == 10) { switch (model) { case 1: // AMD Ryzen case 8: - // Ryzen 2 + // Ryzen 2 + default: + // Matisse,Renoir Ryzen2 models if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; @@ -2237,6 +2301,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); + if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2307,6 +2372,7 @@ void get_sse(void){ if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); + if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); diff --git a/ctest.c b/ctest.c index 5e869b901..cd84ab1bb 100644 --- a/ctest.c +++ b/ctest.c @@ -153,3 +153,6 @@ ARCH_ARM ARCH_ARM64 #endif +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +HAVE_C11 +#endif diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 881b4ee35..09a62d9bf 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,7 +19,10 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +endif + SBLASOBJS += \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ @@ -204,8 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 - +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index c6bbb9ca9..a38506585 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 9216daaed..39824fc5a 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ 
>= 201112L +#ifdef HAVE_C11 _Atomic #else volatile diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 574f825b0..a041abac3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 5a8d497d2..6e1fd9e99 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 9117090b5..1027c0c73 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -135,7 +135,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -205,7 +205,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -300,7 +300,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -370,7 +370,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 62c6a2442..e8df7fb21 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if 
defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/Makefile b/driver/others/Makefile index 5653f3c25..7558ec058 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -47,8 +47,10 @@ endif endif ifdef USE_CUDA +ifeq ($(USE_CUDA), 1) COMMONOBJS += cuda_init.$(SUFFIX) endif +endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index f13b83dd4..756e51b5d 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -141,7 +141,7 @@ typedef struct { } thread_status_t; -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else @@ -272,7 +272,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -#if defined(OS_LINUX) && !defined(NO_AFFINITY) +#if defined(OS_LINUX) && !defined(NO_AFFINITY) int gotoblas_set_affinity(int); int gotoblas_set_affinity2(int); int get_node(void); @@ -281,6 +281,8 @@ int get_node(void); static int increased_threads = 0; #ifdef OS_LINUX +extern int openblas_get_num_threads(void); + int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { const int active_threads = openblas_get_num_threads(); @@ -602,7 +604,7 @@ int blas_thread_init(void){ if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); - fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, 
&rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4255852c8..d9969b599 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -55,7 +55,7 @@ int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #else static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -320,7 +320,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ while(true) { for(i=0; i < MAX_PARALLEL_NUMBER; i++) { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Bool inuse = false; if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { #else @@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(OMP_SCHED) +#pragma omp parallel for num_threads(num) schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 @@ -345,7 +345,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ exec_threads(&queue[i], buf_index); } -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 atomic_store(&blas_buffer_inuse[buf_index], false); #else blas_buffer_inuse[buf_index] = false; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2e87e186a..5d71b1b2c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,7 +332,7 @@ int support_avx512(){ if((ebx & (1<<7)) == 0){ ret=0; //OS does not even support AVX2 } - if((ebx & (1<<31)) != 0){ + if((ebx & (1u<<31)) != 0){ xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL @@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + case 10: + if (model == 5 || model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: @@ -632,7 +644,7 @@ static gotoblas_t *get_coretype(void){ cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if ( (eax & 0xffff) >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0) return NULL; } else @@ -644,7 +656,7 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else if (exfamily == 5) { + } else if (exfamily == 5 || exfamily == 7) { return &gotoblas_BOBCAT; } else if (exfamily == 6) { if(model == 1){ @@ -698,7 +710,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1 || model == 8) { + /* if (model == 1 || model == 8) */ { if(support_avx()) return &gotoblas_ZEN; else{ @@ -706,16 +718,24 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } - } else if (exfamily == 9) { + } else if (exfamily == 9) { if(support_avx()) return &gotoblas_ZEN; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
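The HAVE_C11 guards introduced in this change set replace repeated __STDC_VERSION__ >= 201112L probes: ctest.c now reports HAVE_C11 once per build and every consumer keys off that flag. The buffer pool in blas_server_omp.c (above) is the main beneficiary; each parallel region claims one of MAX_PARALLEL_NUMBER buffer slots with a weak compare-and-swap. A self-contained sketch of that claim/release pattern, assuming any C11 compiler (the names here are illustrative, not the library's):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define NUM_SLOTS 8                      /* stand-in for MAX_PARALLEL_NUMBER */
    static atomic_bool slot_inuse[NUM_SLOTS];

    static int claim_slot(void) {
        for (;;) {                           /* spin until some slot frees up */
            for (int i = 0; i < NUM_SLOTS; i++) {
                bool expected = false;
                /* a weak CAS may fail spuriously; the retry loop absorbs that */
                if (atomic_compare_exchange_weak(&slot_inuse[i], &expected, true))
                    return i;
            }
        }
    }

    static void release_slot(int i) {
        atomic_store(&slot_inuse[i], false); /* mirrors the atomic_store on buf_index above */
    }

The weak exchange is the right choice here because the surrounding loop retries anyway, and it can be cheaper than the strong form on some architectures.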
- } + } + } else if (exfamily == 10) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } }else { return &gotoblas_BARCELONA; } + } } @@ -764,18 +784,53 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; - if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_ATOM) +#ifdef DYNAMIC_OLDER + return corename[ 6]; +#else + return corename[10]; +#endif if (gotoblas == &gotoblas_CORE2) return corename[ 7]; - if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; - if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_PENRYN) +#ifdef DYNAMIC_OLDER + return corename[ 8]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_DUNNINGTON) +#ifdef DYNAMIC_OLDER + return corename[ 9]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_NEHALEM) return corename[10]; if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; - if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_OPTERON_SSE3) +#ifdef DYNAMIC_OLDER + return corename[12]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_OPTERON) +#ifdef DYNAMIC_OLDER + return corename[13]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BARCELONA) return corename[14]; - if (gotoblas == &gotoblas_NANO) return corename[15]; + if (gotoblas == &gotoblas_NANO) +#ifdef DYNAMIC_OLDER + return corename[15]; +#else + return corename[10]; +#endif if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; - if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BOBCAT) +#ifdef DYNAMIC_OLDER + return corename[17]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; @@ -787,6 +842,7 @@ char *gotoblas_corename(void) { } + static gotoblas_t *force_coretype(char *coretype){ int i ; diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 11ef2725c..157b03365 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; +extern gotoblas_t gotoblas_THUNDERX3T110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 11 +#define NUM_CORETYPES 12 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -82,6 +83,7 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "thunderx3t110", "unknown" }; @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; return corename[NUM_CORETYPES]; } @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); + case 11: return (&gotoblas_THUNDERX3T110); } 
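For context on the new ThunderX3 entries: the ARM64 dispatcher distinguishes cores by MIDR_EL1, where bits 24-31 carry the implementer (0x43 for Cavium/Marvell) and bits 4-15 the part number (0x0b8 for ThunderX3 T110, matching the case added below). A hedged sketch of that decode, assuming an AArch64 Linux host where the kernel traps and emulates the MIDR_EL1 read for user space:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint64_t midr;
        /* privileged register; Linux emulates this read when HWCAP_CPUID is set */
        __asm__ volatile("mrs %0, MIDR_EL1" : "=r"(midr));
        unsigned implementer = (midr >> 24) & 0xff;   /* 0x43 = Cavium/Marvell */
        unsigned part        = (midr >> 4)  & 0xfff;  /* 0x0af = ThunderX2, 0x0b8 = ThunderX3 */
        printf("implementer=0x%02x part=0x%03x\n", implementer, part);
        return 0;
    }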
 	snprintf(message, 128, "Core not found: %s\n", coretype);
 	openblas_warning(1, message);
@@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) {
 			return &gotoblas_THUNDERX;
 		case 0x0af: // ThunderX2
 			return &gotoblas_THUNDERX2T99;
+		case 0x0b8: // ThunderX3
+			return &gotoblas_THUNDERX3T110;
 		}
 		break;
 	case 0x48: // HiSilicon
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 8c831b998..ca1d42408 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -6,6 +6,13 @@ extern gotoblas_t gotoblas_POWER8;
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 extern gotoblas_t gotoblas_POWER9;
 #endif
+#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
+    || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
+#define HAVE_P10_SUPPORT 1
+#endif
+#ifdef HAVE_P10_SUPPORT
+extern gotoblas_t gotoblas_POWER10;
+#endif
 extern void openblas_warning(int verbose, const char *msg);
@@ -13,7 +20,8 @@ static char *corename[] = {
 	"unknown",
 	"POWER6",
 	"POWER8",
-	"POWER9"
+	"POWER9",
+	"POWER10"
 };
 #define NUM_CORETYPES 4
@@ -23,6 +31,9 @@ char *gotoblas_corename(void) {
 	if (gotoblas == &gotoblas_POWER8)	return corename[2];
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	if (gotoblas == &gotoblas_POWER9)	return corename[3];
+#endif
+#ifdef HAVE_P10_SUPPORT
+	if (gotoblas == &gotoblas_POWER10)	return corename[4];
 #endif
 	return corename[0];
 }
@@ -36,6 +47,10 @@ static gotoblas_t *get_coretype(void) {
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	if (__builtin_cpu_is("power9"))
 		return &gotoblas_POWER9;
+#endif
+#ifdef HAVE_P10_SUPPORT
+	if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
+		return &gotoblas_POWER10;
 #endif
 	return NULL;
 }
@@ -61,6 +76,9 @@ static gotoblas_t *force_coretype(char * coretype) {
 	case 2: return (&gotoblas_POWER8);
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	case 3: return (&gotoblas_POWER9);
+#endif
+#ifdef HAVE_P10_SUPPORT
+	case 4: return (&gotoblas_POWER10);
 #endif
 	default: return NULL;
 	}
diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
index 90d3051b1..403b34111 100644
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@@ -1,12 +1,58 @@
-
 #include "common.h"
+#include <stdlib.h>
+
+// Gate kernels for z13 and z14 on gcc version
+#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
+    /* RHEL 7 since 7.3: */ \
+    (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
+     __GNUC_RH_RELEASE__ >= 11)
+#define HAVE_Z13_SUPPORT
+#endif
+
+#if __GNUC__ >= 7
+#define HAVE_Z14_SUPPORT
+#endif
+
+// Guard the use of getauxval() on glibc version >= 2.16
+#ifdef __GLIBC__
+#include <features.h>
+#if __GLIBC_PREREQ(2, 16)
+#include <sys/auxv.h>
+#define HAVE_GETAUXVAL 1
+
+static unsigned long get_hwcap(void)
+{
+	unsigned long hwcap = getauxval(AT_HWCAP);
+	char *maskenv;
+
+	// honor requests for not using specific CPU features in LD_HWCAP_MASK
+	maskenv = getenv("LD_HWCAP_MASK");
+	if (maskenv)
+		hwcap &= strtoul(maskenv, NULL, 0);
+
+	return hwcap;
+	// note that a missing auxval is interpreted as no capabilities
+	// available, which is safe.
+}
+
+#else // __GLIBC_PREREQ(2, 16)
+#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
+
+static unsigned long get_hwcap(void) {
+	// treat missing support for getauxval() as no capabilities available,
+	// which is safe.
+ return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + +extern gotoblas_t gotoblas_ZARCH_GENERIC; +#ifdef HAVE_Z13_SUPPORT extern gotoblas_t gotoblas_Z13; +#endif +#ifdef HAVE_Z14_SUPPORT extern gotoblas_t gotoblas_Z14; -//extern gotoblas_t gotoblas_Z15; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -//extern gotoblas_t gotoblas_Z14; -//#endif +#endif #define NUM_CORETYPES 4 @@ -16,47 +62,50 @@ static char* corename[] = { "unknown", "Z13", "Z14", -// "Z15", "ZARCH_GENERIC", }; char* gotoblas_corename(void) { +#ifdef HAVE_Z13_SUPPORT if (gotoblas == &gotoblas_Z13) return corename[1]; +#endif +#ifdef HAVE_Z14_SUPPORT if (gotoblas == &gotoblas_Z14) return corename[2]; -// if (gotoblas == &gotoblas_Z15) return corename[3]; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// if (gotoblas == &gotoblas_POWER9) return corename[3]; -//#endif - return corename[0]; // try generic? +#endif + if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + + return corename[0]; } -// __builtin_cpu_is is not supported by zarch +/** + * Detect the fitting set of kernels by retrieving the CPU features supported by + * OS from the auxiliary value AT_HWCAP and choosing the set of kernels + * ("coretype") that exploits most of the features and can be compiled with the + * available gcc version. + * Note that we cannot use vector registers on a z13 or newer unless supported + * by the OS kernel (which needs to handle them properly during context switch). + */ static gotoblas_t* get_coretype(void) { - FILE* infile; - char buffer[512], * p; - p = (char*)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Type", buffer, 4)) { - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); + unsigned long hwcap __attribute__((unused)) = get_hwcap(); + + // z14 and z15 systems: exploit Vector Facility (SIMD) and + // Vector-Enhancements Facility 1 (float SIMD instructions), if present. 
+#ifdef HAVE_Z14_SUPPORT + if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + return &gotoblas_Z14; #endif - break; - } - } - fclose(infile); + // z13: Vector Facility (SIMD for double) +#ifdef HAVE_Z13_SUPPORT + if (hwcap & HWCAP_S390_VX) + return &gotoblas_Z13; +#endif - if (strstr(p, "2964")) return &gotoblas_Z13; - if (strstr(p, "2965")) return &gotoblas_Z13; - if (strstr(p, "3906")) return &gotoblas_Z14; - if (strstr(p, "3907")) return &gotoblas_Z14; - if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 - - return NULL; // should be ZARCH_GENERIC + // fallback in case of missing compiler support, systems before z13, or + // when the OS does not advertise support for the Vector Facility (e.g., + // missing support in the OS kernel) + return &gotoblas_ZARCH_GENERIC; } static gotoblas_t* force_coretype(char* coretype) { @@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) { switch (found) { +#ifdef HAVE_Z13_SUPPORT case 1: return (&gotoblas_Z13); +#endif +#ifdef HAVE_Z14_SUPPORT case 2: return (&gotoblas_Z14); -// case 3: return (&gotoblas_Z15); -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// case 3: return (&gotoblas_POWER9); -//#endif +#endif + case 3: return (&gotoblas_ZARCH_GENERIC); default: return NULL; } snprintf(message, 128, "Core not found: %s\n", coretype); @@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) { - snprintf(coremsg, 128, "Falling back to Z14 core\n"); + snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); openblas_warning(1, coremsg); - gotoblas = &gotoblas_Z14; + gotoblas = &gotoblas_ZARCH_GENERIC; } if (gotoblas && gotoblas->init) { diff --git a/driver/others/memory.c b/driver/others/memory.c index 5abcbf3a4..9b6c226a1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1095,7 +1095,7 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static _Atomic int memory_initialized = 0; #else static volatile int memory_initialized = 0; @@ -2070,7 +2070,7 @@ if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; perror("OpenBLAS : munmap failed:"); - printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); + printf("error code=%d,\trelease->address=%p\n",errsv,release->address); } } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index b1f3befae..5d312fa87 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -180,9 +180,10 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ - defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) + defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -266,7 +267,9 @@ int 
get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ + defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ + defined(SKYLAKEX) || defined(COOPERLAKE) int size = 16; #else int size = get_L2_size(); diff --git a/exports/Makefile b/exports/Makefile index 60291b1ff..75901586c 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif +ifndef BUILD_HALF +BUILD_HALF = 0 +endif + ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) ifndef ONLY_CBLAS @@ -51,6 +55,10 @@ endif endif endif +ifeq ($(C_COMPILER), PGI) +EXTRALIB += -pgf90libs +endif + ifneq (,$(filter 1 2,$(NOFORTRAN))) FEXTRALIB = endif @@ -151,8 +159,12 @@ ifeq ($(F_COMPILER), INTEL) -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else - ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -234,23 +246,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" 
$(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 235446f14..73b4be248 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -30,7 +30,7 @@ icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, - shgemm, smax,smin,snrm2, + smax,smin,snrm2, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, @@ -40,17 +40,13 @@ ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, xerbla, saxpby,daxpby,caxpby,zaxpby, + somatcopy, domatcopy, comatcopy, zomatcopy, + simatcopy, dimatcopy, cimatcopy, zimatcopy, sgeadd,dgeadd,cgeadd,zgeadd, - somatcopy, - simatcopy, - domatcopy, - dimatcopy, - comatcopy, - cimatcopy, - zomatcopy, - zimatcopy, + ssum, dsum, scsum, dzsum ); +@halfblasobjs = (shgemm); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -67,7 +63,7 @@ cblas_isamax, cblas_izamax, cblas_sasum, cblas_saxpy, cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, - cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg, + cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, @@ -80,9 +76,16 @@ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, - cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd + cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, + cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, + cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, + cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, + cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, + cblas_xerbla ); +@halfcblasobjs = (cblas_shgemm); + @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qgemv,qger,qmax,qmin, @@ -3454,6 +3457,10 @@ use File::Spec; use File::Basename; my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); +if ($ARGV[12] == 1) { + @blasobjs = (@blasobjs, @halfblasobjs); + @cblasobjs = (@cblasobjs, @halfcblasobjs); +} if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); @@ -3494,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, 
 @gemm3mobjs); };
 	if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
 	if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
-
 if ($ARGV[4] == 0) {
     @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs);
+    if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
+    if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
+    if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
+    if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
 }else{
 #NO_CBLAS=1
     @no_underscore_objs = (@misc_no_underscore_objs);
diff --git a/f_check b/f_check
index fac8fc707..dd4d3475c 100644
--- a/f_check
+++ b/f_check
@@ -82,6 +82,9 @@ if ($compiler eq "") {
 	if ($compiler =~ /flang/) {
 	    $vendor = FLANG;
 	    $openmp = "-fopenmp";
+	} elsif ($compiler =~ /pgf/) {
+	    $vendor = PGI;
+	    $openmp = "-mp";
 	} else {
 	    $vendor = G77;
 	    $openmp = "";
 	}
@@ -334,7 +337,8 @@ if ($link ne "") {
 	    && ($flags !~ /kernel32/)
 	    && ($flags !~ /advapi32/)
 	    && ($flags !~ /shell32/)
-	    && ($flags !~ /omp/)
+	    && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/))
+	    && ($flags !~ /[0-9]+/)
 	    && ($flags !~ /^\-l$/)
 		) {
 	    $linker_l .= $flags . " ";
diff --git a/getarch.c b/getarch.c
index e739d2de9..83043bdf2 100644
--- a/getarch.c
+++ b/getarch.c
@@ -90,11 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
+#if defined(AIX)
+#include <unistd.h>
+#endif
+#if defined(__x86_64__) || defined(_M_X64)
 #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
 #else
 #define NO_AVX512
 #endif
+#endif
 /* #define FORCE_P2		*/
 /* #define FORCE_KATMAI	*/
 /* #define FORCE_COPPERMINE	*/
@@ -360,6 +365,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
+#ifdef FORCE_COOPERLAKE
+#ifdef NO_AVX512
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE    "X86"
+#define SUBARCHITECTURE "HASWELL"
+#define ARCHCONFIG   "-DHASWELL " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
+		     "-DFMA3"
+#define LIBNAME   "haswell"
+#define CORENAME  "HASWELL"
+#else
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE    "X86"
+#define SUBARCHITECTURE "COOPERLAKE"
+#define ARCHCONFIG   "-DCOOPERLAKE " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
+		     "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
+#define LIBNAME   "cooperlake"
+#define CORENAME  "COOPERLAKE"
+#endif
+#endif
+
 #ifdef FORCE_ATOM
 #define FORCE
 #define FORCE_INTEL
@@ -650,6 +685,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER9" #endif +#if defined(FORCE_POWER10) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER10" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER10 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power10" +#define CORENAME "POWER10" +#endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" @@ -1156,6 +1204,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "EMAG8180" #endif +#ifdef FORCE_THUNDERX3T110 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "THUNDERX3T110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTHUNDERX3T110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "thunderx3t110" +#define CORENAME "THUNDERX3T110" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" @@ -1284,6 +1350,11 @@ static int get_num_cores(void) { sysctl(m, 2, &count, &len, NULL, 0); return count; + +#elif defined(AIX) + //returns the number of processors which are currently online + return sysconf(_SC_NPROCESSORS_ONLN); + #else return 2; #endif @@ -1362,10 +1433,12 @@ int main(int argc, char *argv[]){ #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); -#endif -#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 +#elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); #endif +#if defined(_CALL_ELF) && (_CALL_ELF == 2) +printf("ELF_VERSION=2\n"); +#endif #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5ea39f864..7a8fc6698 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) if (USE_GEMM3M) - GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) + GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type}) endif() endif () if (${float_type} STREQUAL "COMPLEX") diff --git a/interface/Makefile b/interface/Makefile index 741f6bac0..2dbd60073 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,7 +46,9 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) SHBLAS3OBJS = shgemm.$(SUFFIX) +endif DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ @@ -278,7 +280,9 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +endif CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -363,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) endif -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) override CFLAGS += -I. 
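The AIX branch added to get_num_cores() above queries the number of processors currently online through POSIX sysconf() instead of parsing utility output. A standalone equivalent, assuming a POSIX system; this is a sketch, and the fallback mirrors getarch.c's default of 2 when the count cannot be determined:

    #include <stdio.h>
    #include <unistd.h>

    int main(void) {
    #ifdef _SC_NPROCESSORS_ONLN
        /* processors currently online -- the same query the AIX branch uses */
        long n = sysconf(_SC_NPROCESSORS_ONLN);
        if (n > 0) {
            printf("%ld cores online\n", n);
            return 0;
        }
    #endif
        printf("2 cores (default)\n");  /* getarch.c returns 2 when it cannot tell */
        return 0;
    }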
@@ -1214,8 +1218,10 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1778,8 +1784,10 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemm.c b/interface/gemm.c index 77a7c8547..68ad8945b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -324,8 +324,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { - sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } diff --git a/interface/swap.c b/interface/swap.c index 17a9868a9..ea40b1fc2 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/interface/zswap.c b/interface/zswap.c index 372b15447..43971b73e 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 4113a1647..84dd949a4 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -127,17 +127,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) + if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () + set(USE_DIRECT_SGEMM false) + if (X86_64) + set(USE_DIRECT_SGEMM true) + endif() + + if (USE_DIRECT_SGEMM) + # if (NOT DEFINED SGEMMDIRECTKERNEL) + set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) + set (SGEMMDIRECTPERFORMANT 
sgemm_direct_performant.c) + # endif() + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) + endif() + foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (NOT ${BUILD_HALF}) + continue () + else () + set (float_char "SH") + endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() diff --git a/kernel/Makefile b/kernel/Makefile index 9b468a6af..16211218f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,8 +8,14 @@ include $(TOPDIR)/Makefile.system ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) endif +ifeq ($(ARCH), power) +ifeq ($(C_COMPILER), CLANG) + override CFLAGS += -fno-integrated-as +endif +endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 @@ -32,7 +38,22 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), SKYLAKEX) +ifeq ($(TARGET_CORE), COOPERLAKE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + ifeq ($(GCCVERSIONGTEQ10), 1) + override CFLAGS += -march=cooperlake + else + override CFLAGS += -march=skylake-avx512 + endif + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 91d3e43a6..c58787b24 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64) USE_GEMM3M = 1 endif +ifeq ($(ARCH), x86_64) +USE_DIRECT_SGEMM = 1 +endif + ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif @@ -39,18 +43,28 @@ ifeq ($(CORE), SKYLAKEX) USE_TRMM = 1 endif +ifeq ($(CORE), COOPERLAKE) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif ifeq ($(CORE), POWER8) +ifeq ($(BINARY64),1) USE_TRMM = 1 endif +endif ifeq ($(CORE), POWER9) USE_TRMM = 1 endif +ifeq ($(CORE), POWER10) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif @@ -59,7 +73,15 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif -#ifndef SHGEMMKERNEL +ifdef USE_DIRECT_SGEMM +ifndef SGEMMDIRECTKERNEL +SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c +SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c +endif +endif + +ifeq ($(BUILD_HALF), 1) +ifndef SHGEMMKERNEL SHGEMM_BETA = ../generic/gemm_beta.c SHGEMMKERNEL = ../generic/gemmkernel_2x2.c SHGEMMINCOPY = ../generic/gemm_ncopy_2.c @@ -70,18 +92,25 @@ SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) -#endif +endif SHKERNELOBJS += \ shgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) +endif SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) +ifdef USE_DIRECT_SGEMM +SKERNELOBJS += \ + 
sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) +endif + DKERNELOBJS += \ dgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ @@ -110,7 +139,9 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += $(SHKERNELOBJS) +endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) @@ -118,7 +149,10 @@ CBLASOBJS += $(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -461,11 +495,13 @@ ZBLASOBJS += \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) - +ifeq ($(BUILD_HALF), 1) SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +endif + SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -491,8 +527,10 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -512,12 +550,16 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) + $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) + ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ rm shgemmotcopy.s shgemmotcopy_nomacros.s @@ -532,7 +574,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ rm shgemmitcopy.s shgemmitcopy_nomacros.s @@ -540,6 +582,7 @@ else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif endif $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) @@ -547,7 +590,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s 
sgemmotcopy_nomacros.s @@ -563,7 +606,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s @@ -575,7 +618,7 @@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s @@ -593,7 +636,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s @@ -636,7 +679,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s @@ -659,7 +702,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s @@ -691,7 +734,7 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s @@ -699,19 +742,29 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifdef USE_DIRECT_SGEMM +$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifeq ($(BUILD_HALF), 1) + $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif 
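The recurring -E to -S change in the AIX branches above and below fixes the hand-rolled preprocess/m4/assemble pipeline: -E stops after preprocessing, so for the kernels written in C the generated .s file still contained C source and the final assembly step could not consume it, while -S stops after compilation proper and emits genuine assembly. Writing it to stdout with -o - and redirecting presumably keeps the output name under the Makefile's control regardless of how the compiler would name its own -S output. A toy illustration (not part of the patch):

    /* toy_scale.c - a stand-in for a generic C kernel such as
     * ../generic/gemm_ncopy_2.c */
    static double scale = 2.0;
    double toy_scale(double x) { return scale * x; }
    /*
     * old: cc -E toy_scale.c -o toy_scale.s        # ".s" file still holds C source
     * new: cc -S toy_scale.c -o - > toy_scale.s    # genuine assembly on stdout
     *      m4  toy_scale.s > toy_scale_nomacros.s  # expand m4 macros in the asm
     *      cc -c toy_scale_nomacros.s -o toy_scale.o   # now assembles cleanly
     */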
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s @@ -724,7 +777,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s @@ -734,7 +787,7 @@ endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s @@ -744,7 +797,7 @@ endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -754,7 +807,7 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s @@ -764,7 +817,7 @@ endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s @@ -774,7 +827,7 @@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s @@ -784,7 +837,7 @@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC 
zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s @@ -794,7 +847,7 @@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s @@ -818,7 +871,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s @@ -828,7 +881,7 @@ endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s @@ -838,7 +891,7 @@ endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s @@ -848,7 +901,7 @@ endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -858,7 +911,7 @@ endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s @@ -868,7 +921,7 @@ endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA 
dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s @@ -878,7 +931,7 @@ endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s @@ -888,7 +941,7 @@ endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s @@ -910,7 +963,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s @@ -920,7 +973,7 @@ endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s @@ -930,7 +983,7 @@ endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s @@ -940,7 +993,7 @@ endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s @@ -950,7 +1003,7 @@ endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + $(CC) 
$(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s @@ -960,7 +1013,7 @@ endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s @@ -970,7 +1023,7 @@ endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s @@ -980,7 +1033,7 @@ endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s @@ -990,7 +1043,7 @@ endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s @@ -1000,7 +1053,7 @@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s @@ -1010,7 +1063,7 @@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s 
ztrmm_kernel_lr_nomacros.s @@ -1020,7 +1073,7 @@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s @@ -1030,7 +1083,7 @@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s @@ -1040,7 +1093,7 @@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s @@ -1050,7 +1103,7 @@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s @@ -1060,7 +1113,7 @@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s @@ -1080,7 +1133,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -1214,7 +1267,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E 
-DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s @@ -2325,8 +2378,10 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -2343,6 +2398,8 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ @@ -2357,6 +2414,8 @@ $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif + $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2373,7 +2432,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif -$(D cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -2527,7 +2589,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 733c235c6..ba0e57eb5 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; - +#if !defined(__PPC__) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; - +#else + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); +#endif if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - CREAL(result) = dot[0]; +#if !defined(__POWER__) + CREAL(result) = dot[0]; CIMAG(result) = dot[1]; +#else + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); +#endif return(result); } diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index c1d33fa3e..eba38a92e 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -1,3 +1,187 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = 
../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +SDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c 
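On the kernel/arm/zdot.c hunk further above: the usual portability issue there is that when OPENBLAS_COMPLEX_FLOAT is the C99 float _Complex type, CREAL()/CIMAG() are not assignable lvalues on every toolchain, so on PowerPC the result is now built in a single expression with OPENBLAS_MAKE_COMPLEX_FLOAT. Note also that the first hunk guards on __PPC__ while the second guards on __POWER__; that asymmetry looks unintentional and is worth double-checking. A minimal sketch of the one-expression construction (an illustration, not the library macro itself):

    /* zinit_sketch.c - building a complex value at once, the way the
     * OPENBLAS_MAKE_COMPLEX_FLOAT path does, instead of assigning the
     * real and imaginary parts separately */
    #include <complex.h>

    static float _Complex make_cresult(float re, float im) {
        return re + im * I;   /* creal()/cimag() are not lvalues in ISO C */
    }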
+endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 new file mode 100644 index 000000000..a20d0d4a6 --- /dev/null +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -0,0 +1,184 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S 
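These per-core kernel files route each BLAS routine either to a tuned assembly implementation or to a generic C fallback, and the GEMM entries additionally pin the packing (copy) routines to the kernel's unroll factors. For readers of the sgemm_kernel_8x8_cortexa53.S source that follows, this is the register-blocking idea it implements, shown as plain C (a conceptual sketch only; the real kernel keeps an 8x8 float tile of C in NEON registers v16-v31, per its own comment block, and here a 4x4 tile is used for brevity):

    /* micro_kernel_4x4.c - scalar model of a register-blocked GEMM
     * micro-kernel computing C += alpha * A*B on one output tile */
    static void micro_kernel_4x4(long K, const float *A, const float *B,
                                 float *C, long ldc, float alpha) {
        float acc[4][4] = {{0.0f}};          /* the "register" tile */
        for (long k = 0; k < K; k++) {
            /* A is packed 4 rows wide, B is packed 4 columns wide */
            for (int i = 0; i < 4; i++)
                for (int j = 0; j < 4; j++)
                    acc[i][j] += A[4*k + i] * B[4*k + j];
        }
        for (int i = 0; i < 4; i++)          /* single write-back pass,  */
            for (int j = 0; j < 4; j++)      /* as in the SAVE macros    */
                C[i + (long)j*ldc] += alpha * acc[i][j];
    }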
+DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = 
dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S index b8d0af5c2..baf39150f 100644 --- a/kernel/arm64/daxpy_thunderx2t99.S +++ b/kernel/arm64/daxpy_thunderx2t99.S @@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, #128 .endm +/* + * No need to do software prefetches if the vector fits + * into L1 cache + */ +.macro KERNEL_F16_L1CACHE + ldp q4, q5, [X] + ldp q16, q17, [Y] + + ldp q6, q7, [X, #32] + ldp q18, q19, [Y, #32] + + fmla v16.2d, v4.2d, v0.d[0] + fmla v17.2d, v5.2d, v0.d[0] + + stp q16, q17, [Y] + + ldp q20, q21, [X, #64] + ldp q24, q25, [Y, #64] + + fmla v18.2d, v6.2d, v0.d[0] + fmla v19.2d, v7.2d, v0.d[0] + + stp q18, q19, [Y, #32] + + ldp q22, q23, [X, #96] + ldp q26, q27, [Y, #96] + + fmla v24.2d, v20.2d, v0.d[0] + fmla v25.2d, v21.2d, v0.d[0] + + stp q24, q25, [Y, #64] + + fmla v26.2d, v22.2d, v0.d[0] + fmla v27.2d, v23.2d, v0.d[0] + + stp q26, q27, [Y, #96] + + add Y, Y, #128 + add X, X, #128 +.endm + .macro KERNEL_F32 KERNEL_F16 KERNEL_F16 .endm + +.macro KERNEL_F32_L1CACHE + KERNEL_F16_L1CACHE + KERNEL_F16_L1CACHE +.endm + .macro INIT_S lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 @@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp I, xzr beq .Ldaxpy_kernel_F1 + cmp N, #2048 + ble .Ldaxpy_kernel_F32_L1CACHE + .align 5 .Ldaxpy_kernel_F32: @@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subs I, I, #1 bne .Ldaxpy_kernel_F32 + b .Ldaxpy_kernel_F1 + + .align 5 +.Ldaxpy_kernel_F32_L1CACHE: + + KERNEL_F32_L1CACHE + + subs I, I, #1 + bne .Ldaxpy_kernel_F32_L1CACHE .Ldaxpy_kernel_F1: diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..628a928ca --- /dev/null +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -0,0 +1,2299 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 640 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 96 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + ldr x22, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + 
fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, 
[pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q2, q3, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + stp q2, q3, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q4, q5, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + stp q4, q5, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q6, q7, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + stp q6, q7, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q0, q1, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + stp q0, q1, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q2, q3, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + stp q2, q3, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q4, q5, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + stp q4, q5, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q6, q7, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + stp q6, q7, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x22, [pA], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + ldr x22, [pA], #8 + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + ldr x20, [pA], #8 + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, 
v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr q2, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + str q2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q4, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + str q4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr q6, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + str q6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q0, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + str q0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr q2, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + str q2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q4, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + str q4, [pCRow2] + + ldr q6, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + str q6, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ldr d0, [pA], #8 + ldp q4, q5, [pB], #32 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ldr d0, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr d2, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + str d2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d4, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + str d4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr d6, [pCRow1] + fmla v6.2s, v22.2s, alphaV2 + str d6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d0, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr d2, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + str d2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d4, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + str d4, [pCRow2] + + ldr d6, [pCRow1] + fmla v6.2s, v30.2s, alphaV2 + str d6, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ldp q4, q5, [pB], #32 + ldr s0, [pA], #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ldr s0, [pCRow0] + fmla s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s6, [pCRow1] + fmla s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s0, [pCRow2] + fmla s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s28, alphaV0 
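+	// rows seven and eight of the 1x8 column: scale by alpha and store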
+ str s4, [pCRow2] + + ldr s6, [pCRow1] + fmla s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, wzr + fmov s23, s16 +.endm + +.macro KERNEL8x4_I + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + fmul v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmul v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] +.endm + +.macro KERNEL8x4_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr x22, [pA], #16 + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #8 + fmla v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmla v18.4s, v0.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] +.endm + +.macro KERNEL8x4_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr x20, [pA], #16 + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v6.s[0] + ldr x21, [pA], #8 + fmla v18.4s, v2.4s, v6.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] +.endm + +.macro KERNEL8x4_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v6.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] +.endm + +.macro KERNEL8x4_SUB + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] +.endm + +.macro SAVE8x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q0, q1, [pCRow2] + fmla v0.4s, v20.4s, alphaV0 + fmla v1.4s, v21.4s, alphaV1 + stp q0, q1, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] + fmla v4.4s, v22.4s, alphaV0 + fmla v5.4s, v23.4s, alphaV1 + stp q4, q5, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, wzr +.endm + +.macro KERNEL4x4_I + ldr q0, [pA], #16 + ldr q4, [pB], #16 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr 
x22, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] +.endm + +.macro KERNEL4x4_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr x22, [pA], #8 + ldr x26, [pB], #8 + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] +.endm + +.macro KERNEL4x4_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr x20, [pA], #8 + ldr x24, [pB], #8 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] +.endm + +.macro KERNEL4x4_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] +.endm + +.macro KERNEL4x4_SUB + ldr q0, [pA], #16 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] +.endm + +.macro SAVE4x4 + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] + + add pCRow1, pCRow0, LDC + ldr q1, [pCRow1] + fmla v1.4s, v18.4s, alphaV2 + str q1, [pCRow1] + + add pCRow2, pCRow1, LDC + ldr q2, [pCRow2] + fmla v2.4s, v20.4s, alphaV0 + str q2, [pCRow2] + + add pCRow1, pCRow2, LDC + ldr q3, [pCRow1] + fmla v3.4s, v22.4s, alphaV2 + str q3, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 +.endm + +.macro KERNEL2x4_SUB + ldr d0, [pA], #8 + ldr q4, [pB], #16 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] +.endm + +.macro SAVE2x4 + ldr d8, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + str d8, [pCRow0] + + add pCRow1, pCRow0, LDC + ldr d12, [pCRow1] + fmla v12.2s, v18.2s, alphaV1 + str d12, [pCRow1] + + add pCRow2, pCRow1, LDC + ldr d8, [pCRow2] + fmla v8.2s, v20.2s, alphaV2 + str d8, [pCRow2] + + add pCRow1, pCRow2, LDC + ldr d12, [pCRow1] + fmla v12.2s, v22.2s, alphaV3 + str d12, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 +.endm + +.macro KERNEL8x2_SUB + ldp q0, q1, [pA], #32 + ldr d4, [pB], #8 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] +.endm + +.macro SAVE8x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + 
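+	// second row of the 8x2 tile: C += alpha * (v18, v19)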
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s4, [pB], #4 + ldp q0, q1, [pA], #32 + + fmla v16.4s, v0.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v1.4s, v4.s[0] +.endm + +.macro SAVE8x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + 
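+	// scalar tail: accumulate one element of A against one element of B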
ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lsgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L8_M4_BEGIN + +.Lsgemm_kernel_L8_M8_20: + + mov pB, origPB + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? + blt .Lsgemm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M8_22a + .align 5 + +.Lsgemm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M8_22 + +.Lsgemm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_40: + + INIT8x8 + +.Lsgemm_kernel_L8_M8_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_M8_100 + +.Lsgemm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lsgemm_kernel_L8_M8_46 + +.Lsgemm_kernel_L8_M8_100: + + SAVE8x8 + +.Lsgemm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L8_END + + tst counterI, #4 + ble .Lsgemm_kernel_L8_M2_BEGIN + +.Lsgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
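+	// fewer than 4 K iterations: skip the software-pipelined loop and take the short path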
+ blt .Lsgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M4_22a + .align 5 + +.Lsgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M4_22 + +.Lsgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_40: + + INIT4x8 + +.Lsgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L8_M4_100 + +.Lsgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lsgemm_kernel_L8_M4_100: + + SAVE4x8 + +.Lsgemm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L8_M1_BEGIN + +.Lsgemm_kernel_L8_M2_20: + + INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M2_40 + +.Lsgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_22 + + +.Lsgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M2_100 + +.Lsgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_42 + +.Lsgemm_kernel_L8_M2_100: + + SAVE2x8 + +.Lsgemm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L8_END + +.Lsgemm_kernel_L8_M1_20: + + INIT1x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M1_40 + +.Lsgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_22 + + +.Lsgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M1_100 + +.Lsgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_42 + +.Lsgemm_kernel_L8_M1_100: + + SAVE1x8 + +.Lsgemm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lsgemm_kernel_L999 + + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L4_M4_BEGIN + +.Lsgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
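+	// K < 4: not enough depth to prime the pipelined loop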
+ blt .Lsgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M8_22a + .align 5 + +.Lsgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M8_22 + +.Lsgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_40: + + INIT8x4 + +.Lsgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M8_100 + +.Lsgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lsgemm_kernel_L4_M8_100: + + SAVE8x4 + +.Lsgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L4_END + + tst counterI, #4 + ble .Lsgemm_kernel_L4_M2_BEGIN + +.Lsgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt .Lsgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M4_22a + .align 5 + +.Lsgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M4_22 + +.Lsgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_40: + + INIT4x4 + +.Lsgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M4_100 + +.Lsgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lsgemm_kernel_L4_M4_100: + + SAVE4x4 + +.Lsgemm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L4_M1_BEGIN + +.Lsgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M2_40 + +.Lsgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_22 + + +.Lsgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M2_100 + +.Lsgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_42 + +.Lsgemm_kernel_L4_M2_100: + + SAVE2x4 + +.Lsgemm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L4_END + +.Lsgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M1_40 + +.Lsgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_22 + + +.Lsgemm_kernel_L4_M1_40: + + ands 
counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M1_100 + +.Lsgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_42 + +.Lsgemm_kernel_L4_M1_100: + + SAVE1x4 + +.Lsgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L2_BEGIN: // less than 4 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lsgemm_kernel_L999 + + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lsgemm_kernel_L2_M4_BEGIN + +.Lsgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M8_40 + .align 5 + +.Lsgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_22 + + +.Lsgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M8_100 + +.Lsgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_42 + +.Lsgemm_kernel_L2_M8_100: + + SAVE8x2 + +.Lsgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L2_END + + tst counterI, #4 + ble .Lsgemm_kernel_L2_M2_BEGIN + +.Lsgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M4_40 + .align 5 + +.Lsgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_22 + + +.Lsgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M4_100 + +.Lsgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_42 + +.Lsgemm_kernel_L2_M4_100: + + SAVE4x2 + +.Lsgemm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L2_M1_BEGIN + +.Lsgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M2_40 + +.Lsgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M2_22 + + +.Lsgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M2_100 + +.Lsgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt
.Lsgemm_kernel_L2_M2_42 + +.Lsgemm_kernel_L2_M2_100: + + SAVE2x2 + +.Lsgemm_kernel_L2_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L2_END + +.Lsgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lsgemm_kernel_L2_M1_40 + +.Lsgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_22 + + +.Lsgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M1_100 + +.Lsgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_42 + +.Lsgemm_kernel_L2_M1_100: + + SAVE1x2 + +.Lsgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lsgemm_kernel_L1_M4_BEGIN + +.Lsgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M8_40 + .align 5 + +.Lsgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_22 + + +.Lsgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M8_100 + +.Lsgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_42 + +.Lsgemm_kernel_L1_M8_100: + + SAVE8x1 + +.Lsgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L1_END + + tst counterI, #4 + ble .Lsgemm_kernel_L1_M2_BEGIN + +.Lsgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M4_40 + .align 5 + +.Lsgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_22 + + +.Lsgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M4_100 + +.Lsgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_42 + +.Lsgemm_kernel_L1_M4_100: + + SAVE4x1 + +.Lsgemm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + 
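+	// bit 1 of M clear: no row pair left, fall through to the single-row tail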
ble .Lsgemm_kernel_L1_M1_BEGIN + +.Lsgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M2_40 + +.Lsgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_22 + + +.Lsgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M2_100 + +.Lsgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_42 + +.Lsgemm_kernel_L1_M2_100: + + SAVE2x1 + +.Lsgemm_kernel_L1_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L1_END + +.Lsgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M1_40 + +.Lsgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_22 + + +.Lsgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M1_100 + +.Lsgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_42 + +.Lsgemm_kernel_L1_M1_100: + + SAVE1x1 + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_ncopy_8.S b/kernel/arm64/sgemm_ncopy_8.S new file mode 100644 index 000000000..f99b1d992 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_8.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 +#define A05 x9 +#define A06 x10 +#define A07 x11 +#define A08 x12 + +#define I x13 +#define J x14 +#define K x15 + +#define TEMP1 x16 +#define TEMP2 x17 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x8 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v12.s[0], v0.s[2] + ins v14.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + ins v12.s[1], v1.s[2] + ins v14.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v12.s[2], v2.s[2] + ins v14.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + ins v12.s[3], v3.s[2] + ins v14.s[3], v3.s[3] + + ldr q4, [A05], #16 + ldr q5, [A06], #16 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v13.s[0], v4.s[2] + ins v15.s[0], v4.s[3] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + ins v13.s[1], v5.s[2] + ins v15.s[1], v5.s[3] + + ldr q6, [A07], #16 + ldr q7, [A08], #16 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v13.s[2], v6.s[2] + ins v15.s[2], v6.s[3] + ins v9.s[3], v7.s[0] + ins v11.s[3], v7.s[1] + ins v13.s[3], v7.s[2] + ins v15.s[3], v7.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 +.endm + +.macro COPY2x8 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + + ldr d4, [A05], #8 + ldr d5, [A06], #8 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + + ldr d6, [A07], #8 + ldr d7, [A08], #8 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v9.s[3], v7.s[0] + ins v11.s[3], 
v7.s[1] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY1x8 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + ldr s4, [A05], #4 + ldr s5, [A06], #4 + ins v9.s[0], v4.s[0] + ins v9.s[1], v5.s[0] + + ldr s6, [A07], #4 + ldr s7, [A08], #4 + ins v9.s[2], v6.s[0] + ins v9.s[3], v7.s[0] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY4x4 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY2x4 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY1x4 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + st1 {v8.4s}, [B00], #16 +.endm + +.macro COPY4x2 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 +.endm + +.macro COPY2x2 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + st1 {v8.2s, v9.2s}, [B00], #16 +.endm + +.macro COPY1x2 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + st1 {v8.2s}, [B00], #8 +.endm + +.macro COPY1x1 + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Lsgemm_ncopy_L8_BEGIN: + + asr J, N, #3 // J = N / 8 + cmp J, #0 + ble .Lsgemm_ncopy_L4_BEGIN + + .align 5 +.Lsgemm_ncopy_L8_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A00, A08, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_40 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A01 + + .align 5 +.Lsgemm_tcopy_L8_warnup_1: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_1 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A02 + + .align 5 +.Lsgemm_tcopy_L8_warnup_2: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_2 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A03 + + .align 5 +.Lsgemm_tcopy_L8_warnup_3: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_3 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A04 + + .align 5 
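+	// the same one-load-per-cache-line warm-up walks the remaining source columns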
+.Lsgemm_tcopy_L8_warnup_4: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_4 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A05 + + .align 5 +.Lsgemm_tcopy_L8_warnup_5: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_5 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A06 + + .align 5 +.Lsgemm_tcopy_L8_warnup_6: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_6 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A07 + + .align 5 +.Lsgemm_tcopy_L8_warnup_7: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_7 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A08 + + .align 5 +.Lsgemm_tcopy_L8_warnup_8: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_8 + + .align 5 +.Lsgemm_ncopy_L8_M4_20: + + COPY4x8 + + subs I, I, #1 + bne .Lsgemm_ncopy_L8_M4_20 + +.Lsgemm_ncopy_L8_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_60 + + COPY2x8 + +.Lsgemm_ncopy_L8_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_END + + COPY1x8 + +.Lsgemm_ncopy_L8_M4_END: + + subs J , J, #1 // j-- + bne .Lsgemm_ncopy_L8_M4_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L4_BEGIN: + + tst N, #7 + ble .Lsgemm_ncopy_L999 + + tst N, #4 + ble .Lsgemm_ncopy_L2_BEGIN + +.Lsgemm_ncopy_L4_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_40 + + .align 5 +.Lsgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I, I, #1 + bne .Lsgemm_ncopy_L4_M4_20 + +.Lsgemm_ncopy_L4_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_60 + + COPY2x4 + +.Lsgemm_ncopy_L4_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_END + + COPY1x4 + +.Lsgemm_ncopy_L4_M4_END: + + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L2_BEGIN: + + tst N, #2 + ble .Lsgemm_ncopy_L1_BEGIN + +.Lsgemm_ncopy_L2_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_40 + + .align 5 +.Lsgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Lsgemm_ncopy_L2_M4_20 + + +.Lsgemm_ncopy_L2_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_60 + + COPY2x2 + +.Lsgemm_ncopy_L2_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_END + + COPY1x2 + +.Lsgemm_ncopy_L2_M4_END: + +.Lsgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Lsgemm_ncopy_L999 + +.Lsgemm_ncopy_L1_M1_BEGIN: + + mov A01, A00 + + mov I, M + cmp I, #0 + ble .Lsgemm_ncopy_L1_M1_END + + .align 5 +.Lsgemm_ncopy_L1_M1_20: + + COPY1x1 + + subs I, I, #1 + bne .Lsgemm_ncopy_L1_M1_20 + +.Lsgemm_ncopy_L1_M1_END: + +.Lsgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/sgemm_tcopy_8.S b/kernel/arm64/sgemm_tcopy_8.S new file mode 100644 index 000000000..7d81ba266 --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_8.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x18 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] +
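+	// the last two source rows (A07, A08) follow at the next 64-byte offset of the block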
add TEMP1, TEMP1, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + add B02, B02, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B02] + add B02, B02, #16 + stp d6, d7, [B02] + add B02, B02, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + // single column: step each source pointer by one float, as for A01-A04 + add A05, A05, #4 + add A06, A06, #4 + add A07, A07, #4 + add A08, A08, #4 + + stp s4, s5, [B03] + add B03, B03, #8 + stp s6, s7, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + + add B01, B01,
#64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + + add B02, B02, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B01] + add B01, B01, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B02] + add B02, B02, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B03] + + add B03, B03, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B00] + + add B00, B00, M8 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B01] + + add B01, B01, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B02] + + add B02, B02, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B03] + + add B03, B03, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-8 + and B02 , N , #-4 + and B03 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + + lsl M8, M, #5 // M8 = M * 8 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M8_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #256 // B = B + 8 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble 
.Lsgemm_tcopy_L8_M8_40 + + .align 5 +.Lsgemm_tcopy_L8_M8_20: + + COPY8x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M8_20 + +.Lsgemm_tcopy_L8_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L8_M8_60 + + COPY4x8 + +.Lsgemm_tcopy_L8_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M8_80 + + COPY2x8 + +.Lsgemm_tcopy_L8_M8_80: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M8_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M8_END: + + subs J, J, #1 // j-- + bne .Lsgemm_tcopy_L8_M8_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M8_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov B00, B + add B, B00, #128 // B = B + 4 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M8_40 + + .align 5 +.Lsgemm_tcopy_L4_M8_20: + + COPY8x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M8_20 + +.Lsgemm_tcopy_L4_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L4_M8_60 + + COPY4x4 + +.Lsgemm_tcopy_L4_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M8_80 + + COPY2x4 + +.Lsgemm_tcopy_L4_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L4_M8_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #64 // B = B + 2 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M8_40 + + .align 5 +.Lsgemm_tcopy_L2_M8_20: + + COPY8x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M8_20 + +.Lsgemm_tcopy_L2_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L2_M8_60 + + COPY4x2 + +.Lsgemm_tcopy_L2_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M8_80 + + COPY2x2 + +.Lsgemm_tcopy_L2_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M8_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M8_40 + + .align 5 +.Lsgemm_tcopy_L1_M8_20: + + COPY8x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M8_20 + +.Lsgemm_tcopy_L1_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L1_M8_60 + + COPY4x1 + +.Lsgemm_tcopy_L1_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M8_80 + + COPY2x1 + +.Lsgemm_tcopy_L1_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M8_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M8_END: + +.Lsgemm_tcopy_L999: + + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/strmm_kernel_8x8_cortexa53.S b/kernel/arm64/strmm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..4b84623f3 --- /dev/null +++ b/kernel/arm64/strmm_kernel_8x8_cortexa53.S @@ -0,0 +1,2823 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* 
Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v0.4s, v1.4s}, [pA], #32 + ld1 {v4.4s, v5.4s}, [pB], #32 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + + ldr x22, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, 
v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v0.4s}, [pA], #16 + ld1 {v4.4s, v5.4s}, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x21, [pA], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x21, [pA], #8 + fmla v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #16 + fmla v20.4s, v0.4s, v4.s[2] + ldr x27, [pB], #8 + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x21 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #8 + fmla v18.4s, v2.4s, v6.s[1] + ldr x24, [pB], #16 + fmla v20.4s, v2.4s, v6.s[2] + ldr x25, [pB], #8 + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_E + fmov v2.d[1], x21 + fmov v6.d[1], x26 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, 
v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + + fmul v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + + fmul v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + + fmul v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + + fmul s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s28, alphaV0 + str s4, [pCRow2] + + + fmul s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 
{v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + + ldr d9, [pB], #8 + ldr d2, [pA], #8 + ldr d3, [pA, #8] + fmul v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmul v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmul v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v8.s[2] + fmul v25.4s, v1.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] + fmul v29.4s, v1.4s, v8.s[3] +.endm + +.macro KERNEL8x4_M1 + ldr d9, [pB], #8 + fmov v8.d[1], x24 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + fmla v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmla v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmla v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro KERNEL8x4_M2 + ldr d8, [pB], #8 + fmov v9.d[1], x25 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v9.s[0] + ldr x20, [pA], #16 + fmla v20.4s, v2.4s, v9.s[1] + ldr x21, [pA], #8 + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_E + fmov v9.d[1], x25 + fmov v2.d[1], x22 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + fmla v17.4s, v3.4s, v9.s[0] + fmla v20.4s, v2.4s, v9.s[1] + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] + + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] + + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] + + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + prfm PLDL1KEEP, [pB, 
#512] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] +.endm + +.macro SAVE2x4 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] +.endm + +.macro SAVE8x2 + add pCRow1, 
pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + 
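+/*******************************************************************************
+* Reference semantics of the INIT/KERNEL/SAVE macros above. This is a hedged,
+* illustrative C sketch, not part of the build; acc, M, N and ldc are
+* placeholder names. Each MxN tile keeps its accumulators in v16..v31, each
+* KERNELMxN_* step performs one rank-1 update from the packed panels pA and
+* pB, and SAVEMxN overwrites C with alpha times the accumulator (TRMM does
+* not read the previous contents of C):
+*
+*   for (k = 0; k < tempK; k++)              // KERNELMxN_SUB, tempK times
+*       for (n = 0; n < N; n++)
+*           for (m = 0; m < M; m++)
+*               acc[n][m] += pA[k*M + m] * pB[k*N + n];
+*   for (n = 0; n < N; n++)                  // SAVEMxN
+*       for (m = 0; m < M; m++)
+*           C[n*ldc + m] = alpha * acc[n][m];
+*
+* The _I/_M1/_M2/_E variants software-pipeline this update: _I issues the
+* first loads and the opening fmul chain, _M1/_M2 alternate between two
+* register sets (v0/v1/v4/v5 and v2/v3/v6/v7) so the loads for the next k
+* step overlap the fmla chain of the current one, and _E drains the pipeline
+* without starting new loads.
+*******************************************************************************/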
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lstrmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lstrmm_kernel_L8_M4_BEGIN + +.Lstrmm_kernel_L8_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? 
+ blt .Lstrmm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M8_22a + .align 5 + +.Lstrmm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M8_22 + +.Lstrmm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_40: + + INIT8x8 + +.Lstrmm_kernel_L8_M8_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_M8_100 + +.Lstrmm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lstrmm_kernel_L8_M8_46 + +.Lstrmm_kernel_L8_M8_100: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +.Lstrmm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L8_END + + tst counterI, #4 + ble .Lstrmm_kernel_L8_M2_BEGIN + +.Lstrmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M4_22a + .align 5 + +.Lstrmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M4_22 + +.Lstrmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_40: + + INIT4x8 + +.Lstrmm_kernel_L8_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L8_M4_100 + +.Lstrmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lstrmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lstrmm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L8_M1_BEGIN + +.Lstrmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L8_M2_40 + +.Lstrmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_22 + + +.Lstrmm_kernel_L8_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M2_100 + +.Lstrmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_42 + +.Lstrmm_kernel_L8_M2_100: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L8_END + +.Lstrmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 
+ ble .Lstrmm_kernel_L8_M1_40 + +.Lstrmm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_22 + + +.Lstrmm_kernel_L8_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M1_100 + +.Lstrmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_42 + +.Lstrmm_kernel_L8_M1_100: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +.Lstrmm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lstrmm_kernel_L999 + + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lstrmm_kernel_L4_M4_BEGIN + +.Lstrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M8_22a + .align 5 + +.Lstrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M8_22 + +.Lstrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_40: + + INIT8x4 + +.Lstrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M8_100 + +.Lstrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lstrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L4_END + + tst counterI, #4 + ble .Lstrmm_kernel_L4_M2_BEGIN + +.Lstrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M4_22a + .align 5 + +.Lstrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M4_22 + +.Lstrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_40: + + INIT4x4 + +.Lstrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M4_100 + +.Lstrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lstrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L4_M1_BEGIN + +.Lstrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M2_40 + +.Lstrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_22 + + +.Lstrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M2_100 + +.Lstrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_42 + +.Lstrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L4_END + +.Lstrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M1_40 
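+/* TRMM offset bookkeeping, summarized (a hedged reading of the #if blocks
+   above, not normative): tempK is the effective inner length for the current
+   tile. Depending on LEFT/TRANSA it is either origK - tempOffset or
+   tempOffset plus the tile's unroll factor (e.g. for this 1x4 tile,
+   LEFT && TRANSA gives tempK = tempOffset + 1), so only the non-trivial
+   triangle of the packed operand is traversed. After each SAVE, pA and pB
+   are advanced past the untouched remainder; the lsl #2/#3/#4/#5 shifts
+   scale the 1/2/4/8-wide float counts to bytes. For LEFT, tempOffset then
+   grows by the M unroll so the next row block starts deeper in the
+   triangle. */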
+ +.Lstrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_22 + + +.Lstrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M1_100 + +.Lstrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_42 + +.Lstrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lstrmm_kernel_L999 + + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lstrmm_kernel_L2_M4_BEGIN + +.Lstrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M8_40 + .align 5 + +.Lstrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_22 + + +.Lstrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M8_100 + +.Lstrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_42 + +.Lstrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L2_END + + tst counterI, #4 + ble .Lstrmm_kernel_L2_M2_BEGIN + +.Lstrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) 
&& !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M4_40 + .align 5 + +.Lstrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_22 + + +.Lstrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M4_100 + +.Lstrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_42 + +.Lstrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L2_M1_BEGIN + +.Lstrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M2_40 + +.Lstrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M2_22 + + +.Lstrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M2_100 + +.Lstrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M2_42 + +.Lstrmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L2_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L2_END + +.Lstrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lstrmm_kernel_L2_M1_40 + +.Lstrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_22 + + +.Lstrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M1_100 + +.Lstrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_42 + +.Lstrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lstrmm_kernel_L1_M4_BEGIN + +.Lstrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M8_40 + .align 5 + +.Lstrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_22 + + +.Lstrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M8_100 + +.Lstrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_42 + +.Lstrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble 
.Lstrmm_kernel_L1_END + + tst counterI, #4 + ble .Lstrmm_kernel_L1_M2_BEGIN + +.Lstrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M4_40 + .align 5 + +.Lstrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_22 + + +.Lstrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M4_100 + +.Lstrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_42 + +.Lstrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L1_M1_BEGIN + +.Lstrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M2_40 + +.Lstrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_22 + + +.Lstrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M2_100 + +.Lstrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_42 + +.Lstrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L1_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L1_END + +.Lstrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + 
mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M1_40 + +.Lstrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_22 + + +.Lstrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M1_100 + +.Lstrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_42 + +.Lstrmm_kernel_L1_M1_100: + + SAVE1x1 + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/common_param.h b/kernel/common_param.h deleted file mode 100644 index 29bb65e5c..000000000 --- a/kernel/common_param.h +++ /dev/null @@ -1,1403 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#ifndef COMMON_PARAM_H -#define COMMON_PARAM_H - -#ifndef ASSEMBLER - -#ifdef DYNAMIC_ARCH - -typedef struct { - int dtb_entries; - int offsetA, offsetB, align; - -#if 1 - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; - - float (*shamax_k) (BLASLONG, float *, BLASLONG); - float (*shamin_k) (BLASLONG, float *, BLASLONG); - float (*shmax_k) (BLASLONG, float *, BLASLONG); - float (*shmin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); - - float (*shnrm2_k) (BLASLONG, float *, BLASLONG); - float (*shasum_k) (BLASLONG, float *, BLASLONG); - float (*shsum_k) (BLASLONG, float *, BLASLONG); - int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - - int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
float *); - int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - -#endif - int sgemm_p, sgemm_q, sgemm_r; - int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; - - int exclusive_cache; - - 
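
The file being removed here defines the DYNAMIC_ARCH parameter table: one struct whose fields are the GEMM blocking sizes (p, q, r), the unroll factors, and a function pointer for every kernel of every precision (the bfloat16 "sh" entries above included). Below is a minimal sketch of that pattern in C, with hypothetical names rather than the real OpenBLAS symbols; in the real library one such table exists per CPU target and the matching one is selected once at startup.

/* Hypothetical sketch of a DYNAMIC_ARCH-style dispatch table; names and
 * signatures are illustrative, not the actual OpenBLAS API. */
#include <stdio.h>

typedef long blaslong_t;                      /* stand-in for BLASLONG */

typedef struct {
  int sgemm_p, sgemm_q, sgemm_r;              /* cache-blocking sizes */
  int sgemm_unroll_m, sgemm_unroll_n;         /* micro-kernel tile shape */
  float (*sdot_k)(blaslong_t, float *, blaslong_t, float *, blaslong_t);
} dispatch_table_t;

/* Portable fallback kernel; a real build would also carry variants
 * compiled for newer ISAs (SSE/AVX/NEON ...). */
static float sdot_generic(blaslong_t n, float *x, blaslong_t incx,
                          float *y, blaslong_t incy) {
  float s = 0.0f;
  for (blaslong_t i = 0; i < n; i++)
    s += x[i * incx] * y[i * incy];
  return s;
}

static dispatch_table_t table_generic = {256, 256, 4096, 8, 4, sdot_generic};

/* Set once after CPU detection at library init; every entry point
 * afterwards just dereferences it. */
static dispatch_table_t *active_table = &table_generic;

int main(void) {
  float x[] = {1.0f, 2.0f, 3.0f}, y[] = {4.0f, 5.0f, 6.0f};
  printf("sdot = %g\n", active_table->sdot_k(3, x, 1, y, 1)); /* prints 32 */
  return 0;
}

Dispatching through a single table pointer keeps the per-call cost to one indirect call, which is why the blocking parameters and all kernels travel together in one struct rather than each kernel probing the CPU on its own.

-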
float (*samax_k) (BLASLONG, float *, BLASLONG); - float (*samin_k) (BLASLONG, float *, BLASLONG); - float (*smax_k) (BLASLONG, float *, BLASLONG); - float (*smin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - - float (*snrm2_k) (BLASLONG, float *, BLASLONG); - float (*sasum_k) (BLASLONG, float *, BLASLONG); - float (*ssum_k) (BLASLONG, float *, BLASLONG); - int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); - int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - - int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float 
*); - int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int dgemm_p, dgemm_q, dgemm_r; - int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; - - double (*damax_k) (BLASLONG, double *, BLASLONG); - double (*damin_k) (BLASLONG, double *, BLASLONG); - double (*dmax_k) (BLASLONG, double *, BLASLONG); - double (*dmin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); - - double (*dnrm2_k) (BLASLONG, double *, BLASLONG); - double (*dasum_k) 
(BLASLONG, double *, BLASLONG); - double (*dsum_k) (BLASLONG, double *, BLASLONG); - int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); - int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int 
(*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int qgemm_p, qgemm_q, qgemm_r; - int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; - - xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); - - xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int 
(*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int 
(*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - int cgemm_p, cgemm_q, cgemm_r; - int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - - float (*camax_k) (BLASLONG, float *, BLASLONG); - float (*camin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); - - float (*cnrm2_k) (BLASLONG, float *, BLASLONG); - float (*casum_k) (BLASLONG, float *, BLASLONG); - float (*csum_k) (BLASLONG, float *, BLASLONG); - int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, 
BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LR)(BLASLONG, 
BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, 
BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int cgemm3m_p, cgemm3m_q, cgemm3m_r; - int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; - - int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - - int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - - int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float, float, float *); - int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int zgemm_p, zgemm_q, zgemm_r; - int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; - - double (*zamax_k) (BLASLONG, double *, BLASLONG); - double (*zamin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); - - double (*znrm2_k) (BLASLONG, double *, BLASLONG); - double (*zasum_k) (BLASLONG, double *, BLASLONG); - double (*zsum_k) (BLASLONG, double *, BLASLONG); - int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int 
(*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, 
BLASLONG, BLASLONG); - int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double 
*); - int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int zgemm3m_p, zgemm3m_q, zgemm3m_r; - int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; - - int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - - int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - - int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, 
BLASLONG, double, double, double *); - int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int xgemm_p, xgemm_q, xgemm_r; - int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; - - xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); - - xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, 
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RT)(BLASLONG, 
BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int 
(*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int xgemm3m_p, xgemm3m_q, xgemm3m_r; - int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; - - int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - - int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyb)(BLASLONG, 
BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - - void (*init)(void); - - int snum_opt, dnum_opt, qnum_opt; - - int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); - int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); - int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); - int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); - - int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - - int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - - int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - 
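For orientation in this long dispatch table: the omatcopy and imatcopy entries encode their variant in the suffix. "o" versus "i" means out-of-place versus in-place, "c" or "r" selects column- or row-major storage, "n" or "t" selects no-transpose versus transpose, and a trailing "c" marks the conjugate variants that follow directly below. A minimal sketch of how a DYNAMIC_ARCH build would route one such call through this table; the wrapper name is hypothetical, and it assumes gotoblas (declared a few lines further down) already points at the detected CPU's table:

    /* Hypothetical illustration: dispatch a single-precision out-of-place
       copy, column-major and non-transposed, through the runtime-selected
       gotoblas table. */
    static int somatcopy_cn_dispatch(BLASLONG rows, BLASLONG cols, float alpha,
                                     float *a, BLASLONG lda,
                                     float *b, BLASLONG ldb) {
      return gotoblas->somatcopy_k_cn(rows, cols, alpha, a, lda, b, ldb);
    }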
int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - - int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - - int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); - - int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); - - int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); - int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); - int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); - int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - -} gotoblas_t; - -extern gotoblas_t *gotoblas; - -#define DTB_ENTRIES gotoblas -> 
dtb_entries -#define GEMM_OFFSET_A gotoblas -> offsetA -#define GEMM_OFFSET_B gotoblas -> offsetB -#define GEMM_ALIGN gotoblas -> align - -#define HAVE_EX_L2 gotoblas -> exclusive_cache - -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn - -#define SGEMM_P gotoblas -> sgemm_p -#define SGEMM_Q gotoblas -> sgemm_q -#define SGEMM_R gotoblas -> sgemm_r -#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m -#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n -#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn - -#define DGEMM_P gotoblas -> dgemm_p -#define DGEMM_Q gotoblas -> dgemm_q -#define DGEMM_R gotoblas -> dgemm_r -#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m -#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n -#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn - -#define QGEMM_P gotoblas -> qgemm_p -#define QGEMM_Q gotoblas -> qgemm_q -#define QGEMM_R gotoblas -> qgemm_r -#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m -#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n -#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn - -#define CGEMM_P gotoblas -> cgemm_p -#define CGEMM_Q gotoblas -> cgemm_q -#define CGEMM_R gotoblas -> cgemm_r -#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m -#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n -#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn - -#define ZGEMM_P gotoblas -> zgemm_p -#define ZGEMM_Q gotoblas -> zgemm_q -#define ZGEMM_R gotoblas -> zgemm_r -#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m -#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n -#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn - -#define XGEMM_P gotoblas -> xgemm_p -#define XGEMM_Q gotoblas -> xgemm_q -#define XGEMM_R gotoblas -> xgemm_r -#define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m -#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n -#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn - -#define CGEMM3M_P gotoblas -> cgemm3m_p -#define CGEMM3M_Q gotoblas -> cgemm3m_q -#define CGEMM3M_R gotoblas -> cgemm3m_r -#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m -#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n -#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn - -#define ZGEMM3M_P gotoblas -> zgemm3m_p -#define ZGEMM3M_Q gotoblas -> zgemm3m_q -#define ZGEMM3M_R gotoblas -> zgemm3m_r -#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m -#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n -#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn - -#define XGEMM3M_P gotoblas -> xgemm3m_p -#define XGEMM3M_Q gotoblas -> xgemm3m_q -#define XGEMM3M_R gotoblas -> xgemm3m_r -#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m -#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n -#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn - -#else - -#define DTB_ENTRIES DTB_DEFAULT_ENTRIES - -#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A -#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B -#define GEMM_ALIGN GEMM_DEFAULT_ALIGN - -#ifdef HAVE_EXCLUSIVE_CACHE -#define HAVE_EX_L2 1 -#else -#define HAVE_EX_L2 0 -#endif - -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN -#else -#define SHGEMM_UNROLL_MN 
MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) -#endif - -#define SGEMM_P SGEMM_DEFAULT_P -#define SGEMM_Q SGEMM_DEFAULT_Q -#define SGEMM_R SGEMM_DEFAULT_R -#define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#ifdef SGEMM_DEFAULT_UNROLL_MN -#define SGEMM_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN -#else -#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) -#endif - -#define DGEMM_P DGEMM_DEFAULT_P -#define DGEMM_Q DGEMM_DEFAULT_Q -#define DGEMM_R DGEMM_DEFAULT_R -#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#ifdef DGEMM_DEFAULT_UNROLL_MN -#define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN -#else -#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) -#endif - -#define QGEMM_P QGEMM_DEFAULT_P -#define QGEMM_Q QGEMM_DEFAULT_Q -#define QGEMM_R QGEMM_DEFAULT_R -#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - -#define CGEMM_P CGEMM_DEFAULT_P -#define CGEMM_Q CGEMM_DEFAULT_Q -#define CGEMM_R CGEMM_DEFAULT_R -#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#ifdef CGEMM_DEFAULT_UNROLL_MN -#define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN -#else -#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) -#endif - -#define ZGEMM_P ZGEMM_DEFAULT_P -#define ZGEMM_Q ZGEMM_DEFAULT_Q -#define ZGEMM_R ZGEMM_DEFAULT_R -#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#ifdef ZGEMM_DEFAULT_UNROLL_MN -#define ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN -#else -#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) -#endif - -#define XGEMM_P XGEMM_DEFAULT_P -#define XGEMM_Q XGEMM_DEFAULT_Q -#define XGEMM_R XGEMM_DEFAULT_R -#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) - -#ifdef CGEMM3M_DEFAULT_UNROLL_N - -#define CGEMM3M_P CGEMM3M_DEFAULT_P -#define CGEMM3M_Q CGEMM3M_DEFAULT_Q -#define CGEMM3M_R CGEMM3M_DEFAULT_R -#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) - -#else - -#define CGEMM3M_P SGEMM_DEFAULT_P -#define CGEMM3M_Q SGEMM_DEFAULT_Q -#define CGEMM3M_R SGEMM_DEFAULT_R -#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) - -#endif - - -#ifdef ZGEMM3M_DEFAULT_UNROLL_N - -#define ZGEMM3M_P ZGEMM3M_DEFAULT_P -#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q -#define ZGEMM3M_R ZGEMM3M_DEFAULT_R -#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#else - -#define ZGEMM3M_P DGEMM_DEFAULT_P -#define ZGEMM3M_Q DGEMM_DEFAULT_Q -#define ZGEMM3M_R DGEMM_DEFAULT_R -#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#endif - -#define XGEMM3M_P QGEMM_DEFAULT_P -#define XGEMM3M_Q QGEMM_DEFAULT_Q -#define XGEMM3M_R QGEMM_DEFAULT_R -#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - - -#endif -#endif - -#ifndef COMPLEX -#if defined(XDOUBLE) 
-#define GEMM_P QGEMM_P -#define GEMM_Q QGEMM_Q -#define GEMM_R QGEMM_R -#define GEMM_UNROLL_M QGEMM_UNROLL_M -#define GEMM_UNROLL_N QGEMM_UNROLL_N -#define GEMM_UNROLL_MN QGEMM_UNROLL_MN -#define GEMM_DEFAULT_P QGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R QGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P DGEMM_P -#define GEMM_Q DGEMM_Q -#define GEMM_R DGEMM_R -#define GEMM_UNROLL_M DGEMM_UNROLL_M -#define GEMM_UNROLL_N DGEMM_UNROLL_N -#define GEMM_UNROLL_MN DGEMM_UNROLL_MN -#define GEMM_DEFAULT_P DGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R DGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif defined(HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P SGEMM_P -#define GEMM_Q SGEMM_Q -#define GEMM_R SGEMM_R -#define GEMM_UNROLL_M SGEMM_UNROLL_M -#define GEMM_UNROLL_N SGEMM_UNROLL_N -#define GEMM_UNROLL_MN SGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#endif -#else -#if defined(XDOUBLE) -#define GEMM_P XGEMM_P -#define GEMM_Q XGEMM_Q -#define GEMM_R XGEMM_R -#define GEMM_UNROLL_M XGEMM_UNROLL_M -#define GEMM_UNROLL_N XGEMM_UNROLL_N -#define GEMM_UNROLL_MN XGEMM_UNROLL_MN -#define GEMM_DEFAULT_P XGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R XGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P ZGEMM_P -#define GEMM_Q ZGEMM_Q -#define GEMM_R ZGEMM_R -#define GEMM_UNROLL_M ZGEMM_UNROLL_M -#define GEMM_UNROLL_N ZGEMM_UNROLL_N -#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN -#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P CGEMM_P -#define GEMM_Q CGEMM_Q -#define GEMM_R CGEMM_R -#define GEMM_UNROLL_M CGEMM_UNROLL_M -#define GEMM_UNROLL_N CGEMM_UNROLL_N -#define GEMM_UNROLL_MN CGEMM_UNROLL_MN -#define GEMM_DEFAULT_P CGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R CGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#endif -#endif - -#ifdef XDOUBLE -#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N -#elif defined(DOUBLE) -#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N -#else -#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N -#endif - - -#ifndef QGEMM_DEFAULT_UNROLL_M -#define QGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef QGEMM_DEFAULT_UNROLL_N -#define QGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_M 
-#define XGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_N -#define XGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef GEMM_THREAD -#define GEMM_THREAD gemm_thread_n -#endif - -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) -#endif - -#ifndef SNUMOPT -#define SNUMOPT 2 -#endif - -#ifndef DNUMOPT -#define DNUMOPT 2 -#endif - -#ifndef QNUMOPT -#define QNUMOPT 1 -#endif - -#ifndef GEMM3M_P -#ifdef XDOUBLE -#define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) -#define GEMM3M_P ZGEMM3M_P -#else -#define GEMM3M_P CGEMM3M_P -#endif -#endif - -#ifndef GEMM3M_Q -#ifdef XDOUBLE -#define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) -#define GEMM3M_Q ZGEMM3M_Q -#else -#define GEMM3M_Q CGEMM3M_Q -#endif -#endif - -#ifndef GEMM3M_R -#ifdef XDOUBLE -#define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) -#define GEMM3M_R ZGEMM3M_R -#else -#define GEMM3M_R CGEMM3M_R -#endif -#endif - - -#endif diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c index 5f91d0dbe..d3ab46472 100644 --- a/kernel/generic/gemm_ncopy_16.c +++ b/kernel/generic/gemm_ncopy_16.c @@ -39,24 +39,24 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; - FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, 
ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c index a49a778e6..aaf9c8917 100644 --- a/kernel/generic/gemm_ncopy_8.c +++ b/kernel/generic/gemm_ncopy_8.c @@ -39,30 +39,30 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c index 56268ebf2..14252599a 100644 --- a/kernel/generic/gemm_tcopy_16.c +++ b/kernel/generic/gemm_tcopy_16.c @@ -39,22 +39,22 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2; - FLOAT *boffset; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2; + IFLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT 
ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c index b28f3d219..3e8a839db 100644 --- a/kernel/generic/gemm_tcopy_8.c +++ b/kernel/generic/gemm_tcopy_8.c @@ -39,32 +39,32 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 new file mode 100644 index 000000000..f390fac61 --- /dev/null +++ b/kernel/power/KERNEL.POWER10 @@ -0,0 +1,225 @@ +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +include $(KERNELDIR)/KERNEL.POWER8 +else + +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +SHGEMM_BETA = ../generic/gemm_beta.c +SHGEMMKERNEL = shgemm_kernel_power10.c +SHGEMMINCOPY = ../generic/gemm_ncopy_16.c +SHGEMMITCOPY = ../generic/gemm_tcopy_16.c +SHGEMMONCOPY = ../generic/gemm_ncopy_8.c +SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = sgemm_kernel_power10.c +DTRMMKERNEL = dgemm_kernel_power10.c +CTRMMKERNEL = cgemm_kernel_power10.S 
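The FLOAT to IFLOAT switch in the generic ncopy/tcopy sources above is what lets this new rules file reuse them as SHGEMM packing kernels: IFLOAT names the input (storage) element type, which in the half-precision build no longer matches the FLOAT type used for accumulation. A rough sketch of the type split; these typedefs are stand-in assumptions for illustration, not the project's actual common.h definitions:

    #include <stdint.h>
    #if defined(HALF)
    typedef uint16_t IFLOAT;  /* 16-bit (bfloat16) input storage, assumed here for illustration */
    #else
    typedef float IFLOAT;     /* otherwise the input type coincides with the compute type */
    #endif
    typedef float FLOAT;      /* accumulation stays in single precision either way */

With that split, the same generic copy loops pack either float or bfloat16 panels unchanged; any widening to float is left to the architecture-specific SHGEMM kernel itself.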
+ZTRMMKERNEL = zgemm_kernel_power10.S + +SGEMMKERNEL = sgemm_kernel_power10.c +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_power10.c +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_power10.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_power10.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
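The conditional assignments a little further down switch on GCCVERSIONGTEQ9: toolchains older than GCC 9 get the hand-written POWER9 assembly sources (isamax_power9.S and friends), newer ones get the C implementations. As a reference for what any i?amax kernel must compute, here is a deliberately naive scalar sketch using the project's BLASLONG integer type; the function name is made up, and this is not the optimized kernel:

    #include <math.h>
    /* Naive ISAMAX reference: 1-based index of the first entry with maximum
       magnitude, or 0 for empty/invalid input, matching BLAS conventions. */
    static BLASLONG isamax_ref(BLASLONG n, const float *x, BLASLONG inc_x) {
      if (n <= 0 || inc_x <= 0) return 0;
      BLASLONG best = 1;
      float maxval = fabsf(x[0]);
      for (BLASLONG i = 1; i < n; i++) {
        float v = fabsf(x[i * inc_x]);
        if (v > maxval) { maxval = v; best = i + 1; }
      }
      return best;
    }

The amin variants are the same loop with the comparison reversed.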
+#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMAXKERNEL = isamax_power9.S +else +ISAMAXKERNEL = isamax.c +endif +IDAMAXKERNEL = idamax.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMAXKERNEL = icamax_power9.S +else +ICAMAXKERNEL = icamax.c +endif +IZAMAXKERNEL = izamax.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMINKERNEL = isamin_power9.S +else +ISAMINKERNEL = isamin.c +endif +IDAMINKERNEL = idamin.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMINKERNEL = icamin_power9.S +else +ICAMINKERNEL = icamin.c +endif +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power9.S +else +CAXPYKERNEL = caxpy.c +endif +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = sdot.c +ifneq ($(GCCVERSIONGTEQ9),1) +CDOTKERNEL = cdot_power9.S +else +CDOTKERNEL = cdot.c +endif +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n_power10.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t_power10.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +endif diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c7867012b..c2f4cd204 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,44 @@ +# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = 
gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +else + #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -12,7 +53,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) @@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S +else DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c +# +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +CAXPYKERNEL = zaxpy.S +else ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power8.S @@ -162,6 +215,7 @@ endif else CAXPYKERNEL = caxpy.c endif +endif # ZAXPYKERNEL = zaxpy.c # @@ -232,3 +286,10 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) +IDAMAXKERNEL = ../arm/iamax.c +IDAMINKERNEL = ../arm/iamin.c +IZAMAXKERNEL = ../arm/izamax.c +IZAMINKERNEL = ../arm/izamin.c +endif diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index aabb5d976..ab8fbfcd9 100644 --- a/kernel/power/KERNEL.POWER9 
+++ b/kernel/power/KERNEL.POWER9 @@ -16,7 +16,7 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index f615754bb..54660b54d 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S -CDOTKERNEL = zdot_ppc440.S -ZDOTKERNEL = zdot_ppc440.S +#CDOTKERNEL = zdot_ppc440.S +#ZDOTKERNEL = zdot_ppc440.S +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c ISAMAXKERNEL = iamax_ppc440.S IDAMAXKERNEL = iamax_ppc440.S @@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S -CROTKERNEL = zrot_ppc440.S -ZROTKERNEL = zrot_ppc440.S +#CROTKERNEL = zrot_ppc440.S +#ZROTKERNEL = zrot_ppc440.S +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + SSCALKERNEL = scal_ppc440.S DSCALKERNEL = scal_ppc440.S @@ -78,13 +83,18 @@ DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = zgemm_kernel_altivec_g4.S -CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +#CGEMMKERNEL = zgemm_kernel_altivec_g4.S +#CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +#CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = +#cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = +#cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_g4.S diff --git a/kernel/power/casum.c b/kernel/power/casum.c index a9ece0768..06982bfba 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "casum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index 50df84cc5..5e58034dd 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ccopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index d5b18729a..ef5e4710f 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zdot.c" +#else #include "common.h" #ifndef HAVE_KERNEL_8 @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA return (result); } +#endif diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 2bc99974f..6be8c128c 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -424,7 +424,7 @@ L999: lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) - addi r11, 224 + addi r11, SP, 224 #endif lvx v20, r11, r0 addi r11, r11, 16 @@ -459,4 +459,4 @@ L999: blr EPILOGUE -#endif^ +#endif diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S new file mode 100644 index 000000000..e04f948dd --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs51 +#define alpha_i vs55 +#define save_permute_1 vs59 +#define permute_mask vs63 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power10.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power10.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power10.S b/kernel/power/cgemm_logic_power10.S new file mode 100644 index 000000000..3700ac87b --- /dev/null +++ b/kernel/power/cgemm_logic_power10.S @@ -0,0 +1,2814 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 
+ KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 
32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + /* Pre set value in vs57 as 0xffff0000ffff0000 for masking */ + vspltisb v24, -1 + vspltisb v25, 0 + xxsldwi vs57, vs56, vs57, 1 + xxpermdi vs57, vs57, vs57, 3 + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
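/* The SUB2 chains below peel the leftover iteration count L by its binary
+   digits and branch-and-link into fixed-size subroutines like this one.
+   A rough C sketch of the dispatch (the helper name is illustrative only,
+   it does not exist in this file):
+       if (L & 16) kernel_2x4_steps(16);   // CGEMM_2x4_L16_SUB above
+       if (L & 8)  kernel_2x4_steps(8);    // CGEMM_2x4_L8_SUB below
+       if (L & 4)  kernel_2x4_steps(4);    // inline LOAD/KERNEL/E2 runs
+       if (L & 2)  kernel_2x4_steps(2);
+       if (L & 1)  kernel_2x4_steps(1);    // a single KERNEL2x4
+*/ + 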
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
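/* The _E2 ("end") variants pass Complete=1 to the underlying KERNEL*_2
+   macros, where the _L2 ("loop") variants pass 0, so the final k-pair is
+   presumably closed out without loading operands past the packed buffers. */ + 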
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi.
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi.
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S new file mode 100644 index 000000000..b66e93405 --- /dev/null +++ b/kernel/power/cgemm_macros_power10.S @@ -0,0 +1,2131 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 8 +#define DISP32(ind, disp) (ind*unit_size*32+disp) +#define DISP16(ind, disp) (ind*unit_size*16+disp) +#define DISP8(ind, disp) (ind*unit_size*8+disp) +#define DISP4(ind, disp) (ind*unit_size*4+disp) +#define DISP2(ind, disp) (ind*unit_size*2+disp) +#define DISP1(ind, disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha imaginary instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha imaginary instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} (subtracted in PART2); VSOUT2 = {r0,r1} * {alpha_i,alpha_i} (added to in PART2) */ + +.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmulsp \VSOUT1, \VSINII, alpha_i + xvmulsp \VSOUT2, \VSINRR, alpha_i +.endm + +/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmsubasp \VSOUT1, \VSINRR, alpha_r + xvmaddasp \VSOUT2, \VSINII, alpha_r +.endm + +.macro PERMUTE1 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, \OUT, vs62, 1 +.endm +.macro PERMUTE2 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, vs62, \OUT, 1 + xxperm \OUT, \OUT, permute_mask +.endm +.macro PERMUTE3 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, vs62, \OUT, 2 +.endm +.macro PERMUTE4 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, \OUT, vs62, 2 + xxperm \OUT, \OUT, permute_mask +.endm +.macro GROUP1 + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + xxperm vs9, vs37, permute_mask + xxperm vs13, vs45, permute_mask +.endm +.macro AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + 
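/* Alpha scaling recap, as wired in MULT_APLHA_PART1/PART2 above: for an
+   accumulated element r + i*im and scale alpha_r + i*alpha_i, the two
+   passes compute
+       out_r = r*alpha_r - im*alpha_i   (xvmulsp, then xvmsubasp)
+       out_i = r*alpha_i + im*alpha_r   (xvmulsp, then xvmaddasp)
+   i.e. an ordinary complex multiply split into two FMA passes; the
+   conjugate variants (CN/NC/CC/...) fix signs earlier, in
+   AGGREGATE_REALS_IMAGES. */ + 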
AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13 +.endm +.macro GROUP2 + xxperm vs0, vs34, permute_mask + xxperm vs4, vs42, permute_mask + xxperm vs1, vs35, permute_mask + xxperm vs5, vs43, permute_mask + xxperm vs8, vs38, permute_mask + xxperm vs12, vs46, permute_mask + xxperm vs9, vs39, permute_mask + xxperm vs13, vs47, permute_mask +.endm +.macro AGG_GROUP2 + AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4 + AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5 + AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12 + AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13 +.endm +.macro MULTIPLY_GROUP1 + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +.endm +.macro MULTIPLY_GROUP2 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 +.endm +/* reconstruct r, i pairs*/ +.macro RECONSTRUCT_PAIR1 + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs8, vs9, save_permute_1 + xxperm vs10, vs11, save_permute_1 +.endm +.macro RECONSTRUCT_PAIR2 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 + xxperm vs12, vs13, save_permute_1 + xxperm vs14, vs15, save_permute_1 +.endm +.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4 + xxmfacc \ACC + PERMUTE1 \O1, \R3, \R2, \R1, \R0 + PERMUTE2 \O2, \R1, \R0, \R3, \R2 + PERMUTE3 \O3, \R1, \R0, \R3, \R2 + PERMUTE4 \O4, \R3, \R2, \R1, \R0 +.endm +/* macros for N=4 and M=8 +**********************************************************************************************/ +.macro ZERO4x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + +.macro LOAD4x8 + LOAD4x8O 0, 0 +.endm + +.macro LOAD4x8O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END4x8_NORMAL + END4x8 AO, BO, 64, 32 +.endm + +.macro END4x8_WITHOUT_ADD + END4x8 AO, BO, 0, 0 +.endm + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.endm + +.macro LOAD4x8_2 + LOAD4x8_2O 0, 0 +.endm + +.macro LOAD4x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs40, (64+\OffsetA)(AO) + lxvp vs42, (64+32+\OffsetA)(AO) +.endm + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1 +.endm + +.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + 
xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 42, 39 + xvf32gerpp 2, 43, 39 + xvf32gerpp 1, 40, 39 + xvf32gerpp 0, 41, 39 + xvf32gerpp 7, 42, 38 + xvf32gerpp 6, 43, 38 + xvf32gerpp 5, 40, 38 + xvf32gerpp 4, 41, 38 +.if \Complete==0 + lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64, 32 +.endm + +.macro SAVE4x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60 + SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61 + SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20 + SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + xxperm vs10, vs38, permute_mask + xxperm vs14, vs46, permute_mask + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + xxperm vs11, vs39, permute_mask + xxperm vs15, vs47, permute_mask + xxperm vs0, vs48, permute_mask + xxperm vs4, vs56, permute_mask + xxperm vs1, vs49, permute_mask + xxperm vs5, vs16, permute_mask + AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14 + xxperm vs2, vs50, permute_mask + xxperm vs6, vs58, permute_mask + AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15 + xxperm vs3, vs17, permute_mask + xxperm vs7, vs19, permute_mask + AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4 + xxperm vs8, vs52, permute_mask + xxperm vs12, vs60, permute_mask + AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5 + xxperm vs9, vs53, permute_mask + xxperm vs13, vs61, permute_mask + AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6 + xxperm vs10, vs54, permute_mask + xxperm vs14, vs21, permute_mask + AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7 + xxperm vs11, vs18, permute_mask + xxperm vs15, vs20, permute_mask + AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12 + AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13 +/*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +#ifndef TRMMKERNEL + lxvp vs32, 0(T2) +#endif + MULT_APLHA_PART1 vs36, vs44, 
vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs40, 32(T2) +#endif + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 +#ifndef TRMMKERNEL + lxvp vs34, 0(T3) +#endif + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs42, 32(T3) +#endif + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + MULT_APLHA_PART1 vs48, vs56, vs0, vs1 + MULT_APLHA_PART1 vs49, vs16, vs2, vs3 + stxvp vs26, 32(CO) + MULT_APLHA_PART1 vs50, vs58, vs4, vs5 + MULT_APLHA_PART1 vs17, vs19, vs6, vs7 + stxvp vs28, 0(T1) + MULT_APLHA_PART2 vs48, vs56, vs0, vs1 + MULT_APLHA_PART2 vs49, vs16, vs2, vs3 + stxvp vs30, 32(T1) + MULT_APLHA_PART2 vs50, vs58, vs4, vs5 + MULT_APLHA_PART2 vs17, vs19, vs6, vs7 + MULT_APLHA_PART1 vs52, vs60, vs8, vs9 + MULT_APLHA_PART1 vs53, vs61, vs10, vs11 + MULT_APLHA_PART1 vs54, vs21, vs12, vs13 + MULT_APLHA_PART1 vs18, vs20, vs14, vs15 + MULT_APLHA_PART2 vs52, vs60, vs8, vs9 + MULT_APLHA_PART2 vs53, vs61, vs10, vs11 + MULT_APLHA_PART2 vs54, vs21, vs12, vs13 + MULT_APLHA_PART2 vs18, vs20, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs32, vs32, vs3 + xvaddsp vs33, vs33, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs40, vs40, vs7 + xvaddsp vs41, vs41, vs5 + xvaddsp vs34, vs34, vs11 + xvaddsp vs35, vs35, vs9 + xvaddsp vs42, vs42, vs15 + xvaddsp vs43, vs43, vs13 +#else + xxpermdi vs33, vs8, vs0, 2 + xxpermdi vs32, vs10, vs2, 2 + xxpermdi vs41, vs12, vs4, 2 + xxpermdi vs40, vs14, vs6, 2 + xxpermdi vs35, vs0, vs8, 2 + xxpermdi vs34, vs2, vs10, 2 + xxpermdi vs43, vs4, vs12, 2 + xxpermdi vs42, vs6, vs14, 2 +#endif + stxvp vs32, 0(T2) + stxvp vs40, 32(T2) + stxvp vs34, 0(T3) + stxvp vs42, 32(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro ZERO4x4 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD4x4 + LOAD4x4O 0, 0 +.endm + +.macro LOAD4x4O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END4x4_NORMAL + END4x4 AO, BO, 32, 32 +.endm + +.macro END4x4_WITHOUT_ADD + END4x4 AO, BO, 0, 0 +.endm + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.endm + 
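+/* Note on the MMA scheme used throughout this file (an explanatory
+ sketch only, nothing in this comment is assembled): xvf32gerpp acc,
+ va, vb accumulates the rank-1 outer product of the four floats in va
+ and the four floats in vb into the 4x4 accumulator acc. Because both
+ inputs hold interleaved {real, imag} pairs, each accumulator collects
+ mixed r*r, r*i, i*r and i*i partial products; SHUFFLE_ACC,
+ AGGREGATE_REALS_IMAGES and the MULT_APLHA_PART macros then recombine
+ them into the complex result. A rough scalar C model, with
+ illustrative names only:
+
+ for (int j = 0; j < 4; j++)
+ for (int k = 0; k < 4; k++)
+ acc[j][k] += va[j] * vb[k];
+*/
+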
+.macro LOAD4x4_2 + LOAD4x4_2O 0, 0 +.endm + +.macro LOAD4x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1 +.endm + +.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 36, 38 + xvf32gerpp 2, 37, 38 + xvf32gerpp 1, 36, 39 + xvf32gerpp 0, 37, 39 +.if \Complete==0 + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32, 32 +.endm + +.macro SAVE4x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + #ifndef TRMMKERNEL + lxvp vs28, 0(T2) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 0(T3) +#endif + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 + xvaddsp vs28, vs28, vs7 + xvaddsp vs29, vs29, vs5 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 + xxpermdi vs29, vs12, vs4, 2 + xxpermdi vs28, vs14, vs6, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + stxvp vs28, 0(T2) + stxvp vs30, 0(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x2 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x2 + LOAD4x2O 0, 0 +.endm + +.macro LOAD4x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x2_NORMAL + END4x2 AO, BO, 16, 32 +.endm + +.macro END4x2_WITHOUT_ADD + END4x2 AO, BO, 0, 0 +.endm + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 1, 34, 32 + xvf32gerpp 0, 35, 32 +.endm + +.macro 
LOAD4x2_2 + LOAD4x2_2O 0, 0 +.endm + +.macro LOAD4x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1 +.endm + +.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 1, 34, 33 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 1, 36, 32 + xvf32gerpp 0, 37, 32 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16, 32 +.endm + +.macro SAVE4x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25, 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27, 0(T3) +#endif + GROUP1 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs10, vs2, 0 + xxpermdi vs3, vs0, vs8, 3 + xxpermdi vs11, vs2, vs10, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 + xvaddsp vs25, vs25, vs3 + xvaddsp vs27, vs27, vs11 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs10, vs2, 0 + xxpermdi vs25, vs0, vs8, 3 + xxpermdi vs27, vs2, vs10, 3 +#endif + stxv vs24, 0(CO) + stxv vs25, 0(T1) + stxv vs26, 0(T2) + stxv vs27, 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=1 +**********************************************************************************************/ + +.macro ZERO4x1 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x1 + LOAD4x1O 0, 0 +.endm + +.macro LOAD4x1O OffsetA, OffsetB + lxsd v0, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x1_NORMAL + END4x1 AO, BO,8, 32 +.endm + +.macro END4x1_WITHOUT_ADD + END4x1 AO, BO, 0, 0 +.endm + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD4x1_2 + LOAD4x1_2O 0, 0 +.endm + +.macro LOAD4x1_2O OffsetA, OffsetB + lxv vs32, (\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1 +.endm + +.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x1_L2 OffsetA, 
OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 0, 37, 33 + xvf32gerpp 1, 36, 33 +.if \Complete==0 + lxv vs32, DISP2(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8, 32 +.endm + +.macro SAVE4x1 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 1 + xxpermdi vs40, vs40, vs44, 1 + xxpermdi vs33, vs33, vs37, 1 + xxpermdi vs41, vs41, vs45, 1 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6, 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7, 0(T3) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + xxspltd vs9, vs2, 0 + xxspltd vs11, vs2, 1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 + xvaddsp vs38, vs38, vs9 + xvaddsp vs39, vs39, vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 + xxspltd vs38, vs2, 0 + xxspltd vs39, vs2, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + stxsd v6, 0(T2) + stxsd v7, 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro ZERO2x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD2x8 + LOAD2x8O 0, 0 +.endm + +.macro LOAD2x8O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END2x8_NORMAL + END2x8 AO, BO, 64, 16 +.endm + +.macro END2x8_WITHOUT_ADD + END2x8 AO, BO, 0, 0 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x8_2 + LOAD2x8_2O 0, 0 +.endm + +.macro LOAD2x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1 +.endm + +.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, 
BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 2, 37, 35 + xvf32gerpp 3, 36, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 + +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 41, 34 + xvf32gerpp 3, 40, 34 + xvf32gerpp 0, 39, 34 + xvf32gerpp 1, 38, 34 + +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64, 16 +.endm + +.macro SAVE2x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) + stxvp vs28, 0(T1) + stxvp vs30, 32(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro ZERO2x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD2x4 + LOAD2x4O 0, 0 +.endm + +.macro LOAD2x4O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END2x4_NORMAL + END2x4 AO, BO, 32, 16 +.endm + +.macro END2x4_WITHOUT_ADD + END2x4 AO, BO, 0, 0 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x4_2 + LOAD2x4_2O 0, 0 +.endm + +.macro LOAD2x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1 +.endm + +.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 
AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 37, 34 + xvf32gerpp 1, 36, 34 +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32, 16 +.endm + +.macro SAVE2x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + GROUP1 + AGG_GROUP1 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro ZERO2x2 + xxsetaccz 0 +.endm + +.macro LOAD2x2 + LOAD2x2O 0, 0 +.endm + +.macro LOAD2x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxv vs34, (\OffsetB+0)(BO) +.endm + +.macro END2x2_NORMAL + END2x2 AO, BO, 16, 16 +.endm + +.macro END2x2_WITHOUT_ADD + END2x2 AO, BO, 0, 0 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 32 +.endm + +.macro LOAD2x2_2 + LOAD2x2_2O 0, 0 +.endm + +.macro LOAD2x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) +.endm + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1 +.endm + +.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 32 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs34, DISP4(\Index, \OffsetA)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16, 16 +.endm + +.macro SAVE2x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + 
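+/* note: permute_mask swaps the real and imaginary lanes of each pair;
+ AGGREGATE_REALS_IMAGES_A_PERMUTE below then forms the sign
+ combinations required by the active conjugation variant */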
xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs8, vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs0, vs8, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs0, vs8, 3 +#endif + stxv vs24, 0(CO) + stxv vs26, 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD2x1 + LOAD2x1O 0, 0 +.endm + +.macro LOAD2x1O OffsetA, OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_NORMAL + END2x1 AO, BO,8, 16 +.endm + +.macro END2x1_WITHOUT_ADD + END2x1 AO, BO, 0, 0 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD2x1_2 + LOAD2x1_2O 0, 0 +.endm + +.macro LOAD2x1_2O OffsetA, OffsetB + lxv vs27, (\OffsetA)(AO) + lxvp vs4, (0+\OffsetB)(BO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1 +.endm + +.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetA)(\AREG) + xxspltd vs8, vs27, 1 +.endif +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8, 16 +.endm + +.macro SAVE2x1 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 +#else + /*--v4==vs36 
v5==vs37---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro ZERO1x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD1x8 + LOAD1x8O 0, 0 +.endm + +.macro LOAD1x8O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END1x8_NORMAL + END1x8 AO, BO, 64,8 +.endm + +.macro END1x8_WITHOUT_ADD + END1x8 AO, BO, 0, 0 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.endm + +.macro LOAD1x8_2 + LOAD1x8_2O 0, 0 +.endm + +.macro LOAD1x8_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + vspltisb v10, 0 + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1 +.endm + +.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.if \Complete==0 + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 39 + xvf32gerpp 1, 35, 38 +.if \Complete==0 + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 35, 41 + xvf32gerpp 3, 35, 40 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + +.macro SAVE1x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs34, vs34, vs38, 0 + xxpermdi vs35, vs35, vs39, 0 + xxpermdi vs40, vs40, vs44, 0 + xxperm vs40, vs40, permute_mask + xxpermdi vs41, vs41, vs45, 0 + xxperm vs41, vs41, permute_mask + xxpermdi vs42, vs42, vs46, 0 + xxperm vs42, vs42, permute_mask + xxpermdi vs43, vs43, vs47, 0 + xxperm vs43, vs43, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + AGGREGATE_REALS_IMAGES vs32, 
vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 + xxperm vs4, vs5, vs28 + xxperm vs6, vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + xvaddsp vs26, vs26, vs6 + xvaddsp vs27, vs27, vs4 + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 +**********************************************************************************************/ + +.macro ZERO1x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD1x4 + LOAD1x4O 0, 0 +.endm + +.macro LOAD1x4O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END1x4_NORMAL + END1x4 AO, BO, 32,8 +.endm + +.macro END1x4_WITHOUT_ADD + END1x4 AO, BO, 0, 0 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD1x4_2 + LOAD1x4_2O 0, 0 +.endm + +.macro LOAD1x4_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1 +.endm + +.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 37 + xvf32gerpp 1, 35, 36 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + +.macro SAVE1x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs40, vs40, vs44, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs41, vs41, vs45, 0 + xxperm vs40, vs40, permute_mask + xxperm vs41, vs41, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES 
vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + stxvp vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 +**********************************************************************************************/ + +.macro ZERO1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x2 + LOAD1x2O 0, 0 +.endm + +.macro LOAD1x2O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_NORMAL + END1x2 AO, BO, 16,8 +.endm + +.macro END1x2_WITHOUT_ADD + END1x2 AO, BO, 0, 0 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD1x2_2 + LOAD1x2_2O 0, 0 +.endm + +.macro LOAD1x2_2O OffsetA, OffsetB + lxv vs27, (\OffsetB)(BO) + lxvp vs4, (0+\OffsetA)(AO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1 +.endm + +.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 + +.if \Complete==0 + xxspltd vs8, vs27, 1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP4(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs0 + stxv vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro ZERO1x1 + xxlxor vs32, vs32, 
vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x1 + LOAD1x1O 0, 0 +.endm + +.macro LOAD1x1O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + +.macro END1x1_NORMAL + END1x1 AO, BO,8,8 +.endm + +.macro END1x1_WITHOUT_ADD + END1x1 AO, BO, 0, 0 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs37, vs36 + xvmaddasp vs40, vs37, vs38 +.endm + +.macro LOAD1x1_2 + LOAD1x1_2O 0, 0 +.endm + +.macro LOAD1x1_2O OffsetA, OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1 +.endm + +.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs4, vs8 + xvmaddasp vs40, vs4, vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index, \OffsetB)(\BREG) + lxv vs4, DISP2(\Index, \OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP2(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP2(\Index, 16) +.endif +.endif +.endm + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33, vs32, vs32, 2 + xxpermdi vs41, vs40, vs40, 2 + xvaddsp vs32, vs32, vs33 + xvaddsp vs40, vs40, vs41 + + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs37, vs1 + MULT_APLHA_PART2 vs32, vs40, vs37, vs1 +/* reconstruct r, i pairs*/ + xxperm vs37, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36, vs36, vs37 + stxsd v4, 0(CO) +#else +/* vs37 is v5 */ + stxsd v5, 0(CO) +#endif + addi CO, CO, 8 +.endm + +/****************************TRMM POINTER REFRESH MACROS*************************/ +.macro SHIFT_REG REG1,REG2,SHIFT_VAL +.if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 +.elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 +.elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 +.elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 +.elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 +.endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +/* ptrbb = bb;*/ + mr \PTR_B, \B_VAL /* refresh BPOINT */ +#else +/* +// ptrba =ptrba+ off*C_A; +// ptrbb = bb + off*C_B; +*/ + SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */ + SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL, T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ +#endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif 
defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK, \OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ +.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK, \TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK, \TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4, \TEMP_BK, \C_A + SHIFT_REG T2, \TEMP_BK, \C_B + add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B, T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL, \OFF_VAL, \C_A + #endif +.endm diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index eec3fa37c..8663039c5 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_n.c" +#else #include #include @@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index 691f7a3d3..1bfc235db 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else #include "common.h" @@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 2a5835546..84ba5d913 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -27,7 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { @@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) } #endif +#endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { - +#if defined(__VEC__) || defined(__ALTIVEC__) BLASLONG n1 = n & -8; if ( n1 > 0 ) { @@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; ix=2*n1; } - +#endif while(i < n) { temp[0] = c*x[ix] + s*y[ix] ; diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 31e02fe5a..5144a2e93 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "cswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index d0e060977..999dc677a 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index f09611ff0..2de4e0911 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "daxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 27b39144b..24279f8a2 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index f985df1c5..c5493015a 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ddot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c new file mode 100644 index 000000000..a0bc1a777 --- /dev/null +++ b/kernel/power/dgemm_kernel_power10.c @@ -0,0 +1,864 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#ifdef TRMMKERNEL +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[1] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + 
rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[1] * alpha; +#endif + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + v4sf_t valpha = { alpha, alpha }; + N = n >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + BLASLONG l = 0; + PREFETCH1 (CO, 0); + PREFETCH1 (CO + ldc, 0); + PREFETCH1 (CO + ldc + ldc, 0); + PREFETCH1 (CO + ldc + ldc + ldc, 0); + PREFETCH1 (CO, 128); + PREFETCH1 (CO + ldc, 128); + PREFETCH1 (CO + ldc + ldc, 128); + PREFETCH1 (CO + ldc + ldc + ldc, 128); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + 
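+	      /* each xvf64gerpp accumulates a 4x2 (n x m) outer-product
+	         block; acc0..acc7 together cover the full 16-wide m tile
+	         for this iteration of l */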
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC (&acc5, 10); + SAVE_ACC (&acc7, 14); + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; + v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) 
&& !defined(LEFT) + off += 4; // number of values in A +#endif + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + SAVE2x4_ACC (&acc4, 8); + SAVE2x4_ACC (&acc5, 10); + SAVE2x4_ACC (&acc6, 12); + SAVE2x4_ACC (&acc7, 14); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad 
acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 1]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + v4sf_t t4 = { 0, 0 }; + v4sf_t t5 = { 0, 0 }; + v4sf_t t6 = { 0, 0 }; + v4sf_t t7 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; + v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; + v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; + v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; + v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; + v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; + v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; + v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + t4 += rowA4 * rowB; + t5 += rowA5 * rowB; + t6 += rowA6 * rowB; + t7 += rowA7 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + t4 = t4 * valpha; + t5 = t5 * valpha; + t6 = t6 * valpha; + t7 = t7 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; + CO[8] = t4[0]; + CO[9] = t4[1]; + CO[10] = t5[0]; + CO[11] = t5[1]; + CO[12] = t6[0]; + CO[13] = t6[1]; + CO[14] = t7[0]; + CO[15] = t7[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; + CO[8] += t4[0]; + CO[9] += t4[1]; + CO[10] += t5[0]; + CO[11] += t5[1]; + CO[12] += t6[0]; + CO[13] += t6[1]; + CO[14] += t7[0]; + CO[15] += t7[1]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + 
v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; + v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; + v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; + v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; + v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index 1a3d7669c..ac365b3b2 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -38,9 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
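A word on the MMA pattern the GEMM kernel above (and the dgemv kernels that follow) relies on: __builtin_mma_xxsetaccz clears a 512-bit __vector_quad accumulator, __builtin_mma_xvf64gerpp rank-one-updates it with an FP64 outer product on each k iteration, and __builtin_mma_disassemble_acc spills it into four vector registers that the SAVE_ACC macros scale by alpha and store. Roughly, ignoring the register-ordering details that the rb[1], rb[0] pair assembly compensates for, each xvf64gerpp behaves like the scalar model below (a sketch, not the shipped code):

/* Scalar model of one FP64 MMA accumulate step: a __vector_pair holding
 * four doubles of B times a vector holding two packed doubles of A
 * updates a 4x2 accumulator tile, acc[r][c] += b[r] * a[c]. */
static void xvf64gerpp_model (double acc[4][2], const double b[4], const double a[2])
{
  int r, c;
  for (r = 0; r < 4; r++)
    for (c = 0; c < 2; c++)
      acc[r][c] += b[r] * a[c];
}

Because alpha is applied only at SAVE time, the k loop is a pure multiply-accumulate stream, which is what lets eight accumulators cover a 16x4 tile of C per iteration.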
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dgemv_n_microk_power8.c" #endif +#endif #define NBMAX 4096 diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c new file mode 100644 index 000000000..4be8a5f9b --- /dev/null +++ b/kernel/power/dgemv_n_microk_power10.c @@ -0,0 +1,268 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + double *a0; + double *a1; + double *a2; + double *a3; + + __asm__ + ( + "lxvp 40, 0(%10) \n\t" // x0, x1 + XXSPLTD_S(32,%x9,0) // alpha, alpha + + "sldi %6, %13, 3 \n\t" // lda * sizeof (double) + + "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha + + "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda + "add %6, %6, %6 \n\t" // 2 * lda + + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + + "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda + + "dcbt 0, %3 \n\t" + "dcbt 0, %4 \n\t" + "dcbt 0, %5 \n\t" + "dcbt 0, %6 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "dcbt 0, %2 \n\t" + + "addi %3, %3, 32 \n\t" + "addi %4, %4, 32 \n\t" + "addi %5, %5, 32 \n\t" + "addi %6, %6, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n" + "#a0=%3 a1=%4 a2=%5 a3=%6" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3) // 6 + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 9 + "r" (x), // 10 + "b" (16), // 11 + "3" (ap), // 12 + "4" (lda) // 13 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c new file mode 100644 index 000000000..ad5f1ba0d --- /dev/null +++ b/kernel/power/dgemv_n_power10.c @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef __vector_pair __attribute__((aligned(8))) vecp_t; + +#include "dgemv_n_microk_power10.c" + +#define MMA(X, APTR, ACC) \ + rX = (vec_t *) & X; \ + rowA = *((vecp_t*)((void*)&APTR)); \ + __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); + +#define SAVE(ACC, Z) \ + rowC = (v4sf_t *) &y[Z]; \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][1] = result[1][0]; \ + result[2][1] = result[3][0]; \ + rowC[0] += valpha * result[0]; \ + rowC[1] += valpha * result[2]; + +void +dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, + FLOAT * y, FLOAT alpha) +{ + BLASLONG i, j, tmp; + FLOAT *a0 = a_ptr; + FLOAT *x1 = xo; + vector double valpha = { alpha, alpha }; + v4sf_t *rowC; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + v4sf_t result[4]; + vecp_t rowA; + vec_t *rX; + tmp = (n / 32) * 32; + for (i = 0; i < tmp; i += 32) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + __builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j],
a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i + 0); + SAVE (&acc1, i + 4); + SAVE (&acc2, i + 8); + SAVE (&acc3, i + 12); + SAVE (&acc4, i + 16); + SAVE (&acc5, i + 20); + SAVE (&acc6, i + 24); + SAVE (&acc7, i + 28); + + } + for (i = tmp; i < n; i += 4) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i); + } +} + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16))); + FLOAT *a0 = a_ptr; + FLOAT *a1 = a0 + lda; + FLOAT *a2 = a1 + lda; + FLOAT *a3 = a2 + lda; + + + for ( i=0; i<4; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16))); + + for ( i=0; i<2; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16))); + + for ( i=0; i<1; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i<n; i++ ) + { + *dest += *src; + src++; + dest += inc_dest; + } + } + else + { + for ( i=0; i<n; i++ ) + { + *dest += *src; + src++; + dest++; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG n1, n2, n128; + BLASLONG m1, m2, m3; + BLASLONG lda4 = lda << 2; + BLASLONG lda128 = lda << 7; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT xbuffer[8] __attribute__ ((aligned (16))); + FLOAT *ybuffer; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + ybuffer = buffer; + + n128 = n >> 7; + n1 = (n - (n128 * 128)) >> 2; + n2 = (n - (n128 * 128)) & 3; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + for( i = 0; i < n128 ; i++) + { + dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda128; + x_ptr += 128; + } + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr +=
lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + for( i = 0; i < n128 ; i++) + { + FLOAT xbuffer[128] __attribute__ ((aligned (16))); + BLASLONG j; + for ( j = 0; j < 128 ; j++) + { + xbuffer[j] = x_ptr[0]; + x_ptr += inc_x; + } + dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda128; + } + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c 
index 5d43f673f..c07b3c223 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" +#else + +#include "common.h" #define NBMAX 1024 //#define PREFETCH 1 + #include <altivec.h> #define HAVE_KERNEL4x8_ASM 1 - #if defined(HAVE_KERNEL4x8_ASM) static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { @@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "stxvd2x 39, %[off], %[y] \n\t" "stxvd2x 40, %[off2], %[y] \n\t" - : [memy] "+m" (*(const double (*)[8])y), + : [memy] "+m" (*(double (*)[8])y), [n] "+&r" (n), [a0] "=b" (a0), [a1] "=&b" (a1), @@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do [off2]"=&b" (off2), [temp] "=&b" (tempR) : [memx] "m" (*(const double (*)[n])x), - [mem_ap] "m" (*(const double (*)[]) ap), + [mem_ap] "m" (*(const double (*)[n*8]) ap), [alpha] "d" (alpha), "[a0]" (ap), [x] "b" (x), @@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return (0); } +#endif diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c new file mode 100644 index 000000000..3db4d5785 --- /dev/null +++ b/kernel/power/dgemv_t_power10.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/ + +#include "common.h" + +#define NBMAX 1024 +//#define PREFETCH 1 +#include <altivec.h> + +#define HAVE_KERNEL4x8_ASM 1 + + +#if defined(HAVE_KERNEL4x8_ASM) +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + BLASLONG off2; + BLASLONG tempR; + __asm__( + + "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 + "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) + "xxlxor 34,34,34 \n\t" + "xxlxor 35,34,34 \n\t" + "add %[a2], %[a0], %[temp] \n\t" + "add %[a1], %[a0], %[off] \n\t" + "xxlxor 4,34,34 \n\t" + "xxlxor 5,34,34 \n\t" + "xxlxor 6,34,34 \n\t" + "xxlxor 7,34,34 \n\t" + "add %[a3], %[a2], %[off] \n\t" + "add %[a4], %[a2], %[temp] \n\t" + + "xxlxor 8,34,34 \n\t" + "xxlxor 9,34,34 \n\t" + "add %[a5], %[a3], %[temp] \n\t" + "li %[off],0 \n\t" + "li %[off2],16 \n\t" + + "add %[a6], %[a4], %[temp] \n\t" + "add %[a7], %[a5], %[temp] \n\t" + + + + + "lxvp 32, 0(%[x]) \n\t" + "lxvp 36, 0(%[a0]) \n\t" + "lxvp 38, 0(%[a1]) \n\t" + "lxvp 40, 0(%[a2]) \n\t" + "lxvp 42, 0(%[a3]) \n\t" + "lxvp 44, 0(%[a4]) \n\t" + "lxvp 46, 0(%[a5]) \n\t" + "lxvp 48, 0(%[a6]) \n\t" + "lxvp 50, 0(%[a7]) \n\t" +#if defined(PREFETCH) + "li %[temp],896 \n\t" +#endif + "addic. %[n],%[n],-4 \n\t" + + "li %[off],32 \n\t" + + + "ble- two%= \n\t" + + //-------------------------------------------------- + ".align 5 \n\t" + "one%=: \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 32(%[a0]) \n\t" + "lxvp 38, 32(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 32(%[a2]) \n\t" + "lxvp 42, 32(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 32(%[a4]) \n\t" + "lxvp 46, 32(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. %[n],%[n],-4 \n\t" + "lxvp 48, 32(%[a6]) \n\t" + "lxvp 50, 32(%[a7]) \n\t" + "lxvp 32, 32(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 64(%[a0]) \n\t" + "lxvp 38, 64(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 64(%[a2]) \n\t" + "lxvp 42, 64(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 64(%[a4]) \n\t" + "lxvp 46, 64(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic.
%[n],%[n],-4 \n\t" + "lxvp 48, 64(%[a6]) \n\t" + "lxvp 50, 64(%[a7]) \n\t" + "lxvp 32, 64(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" +#if defined(PREFETCH) + "addi %[temp],%[temp],128 \n\t" +#endif + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a0] \n\t" +#endif + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 96(%[a0]) \n\t" + "lxvp 38, 96(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a1] \n\t" +#endif + "lxvp 40, 96(%[a2]) \n\t" + "lxvp 42, 96(%[a3]) \n\t" + "addi %[off], %[off],32 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 96(%[a4]) \n\t" + "lxvp 46, 96(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a3] \n\t" +#endif + "lxvp 48, 96(%[a6]) \n\t" + "lxvp 50, 96(%[a7]) \n\t" + "lxvp 32, 96(%[x]) \n\t" + + "addic. %[n],%[n],-4 \n\t" + "ble- two%= \n\t" + + "addi %[off2], %[off2],32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a2] \n\t" +#endif + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a4] \n\t" +#endif + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a5] \n\t" +#endif + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 128(%[a0]) \n\t" + "lxvp 38, 128(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 128(%[a2]) \n\t" + "lxvp 42, 128(%[a3]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a6] \n\t" +#endif + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 128(%[a4]) \n\t" + "lxvp 46, 128(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a7] \n\t" +#endif + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 128(%[a6]) \n\t" + "lxvp 50, 128(%[a7]) \n\t" + "lxvp 32, 128(%[x]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[x] \n\t" +#endif + "addi %[a0], %[a0], 128 \n\t" + "addi %[a1], %[a1], 128 \n\t" + "addi %[a2], %[a2], 128 \n\t" + "addi %[a3], %[a3], 128 \n\t" + "addi %[a4], %[a4], 128 \n\t" + "addi %[a5], %[a5], 128 \n\t" + "addi %[a6], %[a6], 128 \n\t" + "addi %[a7], %[a7], 128 \n\t" + "addi %[x], %[x], 128 \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" + //-------------------------------------------- + + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + XXSPLTD_S(36,%x[alpha],0) + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "lxvp 38, 0(%[y]) \n\t" + "lxvp 40, 32(%[y]) \n\t" + + + + XXMRGLD_S(42,35,34) + XXMRGHD_S(43,35,34) + + XXMRGLD_S(44,5,4) + XXMRGHD_S(45,5,4) + + "xvadddp 42,42,43 \n\t" + + XXMRGLD_S(46,7,6) + XXMRGHD_S(47,7,6) + + "xvadddp 44,44,45 \n\t" + + XXMRGLD_S(48,9,8) + XXMRGHD_S(49,9,8) + + "xvadddp 46,46,47 \n\t" + + "xvmaddadp 39,42,36 \n\t" + "xvmaddadp 38,44,36 \n\t" + + "xvadddp 48,48,49 \n\t" + + "xvmaddadp 41,46,36 \n\t" + + "stxvp 38, 0(%[y]) \n\t" + "xvmaddadp 40,48,36 \n\t" + "stxvp 40, 32(%[y]) \n\t" + + : [memy] "+m" (*(double (*)[8])y), + [n] "+&r" (n), + [a0] "=b" (a0), + [a1] "=&b" (a1), + [a2] "=&b" (a2), + [a3] "=&b" (a3), + [a4] "=&b" (a4), + [a5] "=&b" (a5), + [a6] "=&b" (a6), + [a7] "=&b" (a7), + [off] "+&b" (lda), + [off2]"=&b" (off2), + [temp] "=&b" (tempR) + : [memx] "m" (*(const double (*)[n])x), + [mem_ap] "m" (*(const double (*)[n*8]) ap), + [alpha] "d" (alpha), + "[a0]" (ap), + [x] "b" (x), + [y] "b" (y) + : "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", + "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + return; +} +#else +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; +#if defined(PREFETCH) + BLASLONG j, c, k; +#endif + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector double*) a0; + va1 = (__vector double*) a1; + va2 = (__vector double*) a2; + va3 = (__vector double*) a3; + va4 = (__vector double*) a4; + va5 = (__vector double*) a5; + va6 = (__vector double*) a6; + va7 = (__vector double*) a7; + v_x = (__vector double*) x; + +#if defined(PREFETCH) + + c = n >> 1; + + for (j = 0; j < c; j += 64) { + k = (c - j) > 64 ? 
64 : (c - j); + __builtin_prefetch(v_x + 64); + __builtin_prefetch(va0 + 64); + __builtin_prefetch(va1 + 64); + __builtin_prefetch(va2 + 64); + __builtin_prefetch(va3 + 64); + __builtin_prefetch(va4 + 64); + __builtin_prefetch(va5 + 64); + __builtin_prefetch(va6 + 64); + __builtin_prefetch(va7 + 64); + for (i = 0; i < k; i += 2) { +#else + + for (i = 0; i < n/2; i += 2) { +#endif + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + temp0 += v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i + 1] * va1[i + 1]; + temp2 += v_x[i + 1] * va2[i + 1]; + temp3 += v_x[i + 1] * va3[i + 1]; + + temp4 += v_x[i + 1] * va4[i + 1]; + temp5 += v_x[i + 1] * va5[i + 1]; + temp6 += v_x[i + 1] * va6[i + 1]; + temp7 += v_x[i + 1] * va7[i + 1]; + } +#if defined(PREFETCH) + va0 += 64; + va1 += 64; + va2 += 64; + va3 += 64; + va4 += 64; + va5 += 64; + va6 += 64; + va7 += 64; + v_x += 64; + + } +#endif + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + + y[4] += alpha * (temp4[0] + temp4[1]); + y[5] += alpha * (temp5[0] + temp5[1]); + y[6] += alpha * (temp6[0] + temp6[1]); + y[7] += alpha * (temp7[0] + temp7[1]); + +} + +#endif + + +static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* va2 = (__vector double*) a2; + __vector double* va3 = (__vector double*) a3; + __vector double* v_x = (__vector double*) x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i + 1] * va0[i + 1]; + temp5 += v_x[i + 1] * va1[i + 1]; + temp6 += v_x[i + 1] * va2[i + 1]; + temp7 += v_x[i + 1] * va3[i + 1]; + } + + temp0 += temp4; + temp1 += temp5; + temp2 += temp6; + temp3 += temp7; + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + +} + + +static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + __vector double temp1 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]); + y[inc_y] += alpha * (temp1[0] + temp1[1]); +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector double* va0 = (__vector double*) a0; + 
__vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + } + + *y += alpha * (temp0[0] + temp0[1]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; +#if defined(PREFETCH) + __builtin_prefetch(y_ptr+64); +#endif + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < 
(n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + diff --git a/kernel/power/drot.c b/kernel/power/drot.c index baeb54205..951c2f9c9 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "drot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 779a08e9c..39293252b 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
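The dscal.c hunk below repeats the guard pattern this patch applies across the POWER kernel sources: the VSX micro-kernel include is wrapped so it is only compiled when the compiler actually exposes the vector extensions, and the pre-existing generic C path serves as the fallback. A minimal sketch of the resulting file shape (the fallback body here is illustrative, not the shipped code):

#include "common.h"

#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dscal_microk_power8.c"  /* defines HAVE_KERNEL_8 and the fast path */
#endif
#endif

#if !defined(HAVE_KERNEL_8)
/* plain-C scaling loop compiled when the micro-kernel is unavailable */
static void dscal_fallback (BLASLONG n, FLOAT alpha, FLOAT *x)
{
  BLASLONG i;
  for (i = 0; i < n; i++)
    x[i] *= alpha;
}
#endif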
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_8) diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 52b7f50da..ff3f95c79 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 195a8c68e..5016f67dd 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include +#if defined(__VEC__) || defined(__ALTIVEC__) #include +#endif + #if defined(DOUBLE) #define ABS fabs @@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { return index; } +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) > maxf) { diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 8a5538821..e37718c48 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find minimum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } - +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) < minf) { diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index 7149da28b..fe9d5bf95 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find maximum index @@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { } - +#endif @@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 692315b89..94f2383e0 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include <math.h> @@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabs #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find minimum index @@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } +#endif @@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) minf = CABS1(x,0); //index will not be incremented #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) @@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - - diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 5908347d3..733137012 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc..3d3b1613c 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#define offset_0 0 +#define offset_1 16 +#define offset_2 32 +#define offset_3 48 +#define offset_4 64 +#define offset_5 80 +#define offset_6 96 +#define offset_7 112 +#define offset_8 128 +#define offset_9 144 +#define offset_10 160 +#define offset_11 176 +#define offset_12 192 +#define offset_13 208 +#define offset_14 224 +#define offset_15 240 +#if defined(__VEC__) || defined(__ALTIVEC__) #ifndef HAVE_KERNEL_8 #include <altivec.h> @@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
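The saxpy.c hunk that follows only renames the vector pointers and pins the splatted alpha to 16-byte alignment; the underlying shape is the GCC vector-extension form of AXPY. A reduced sketch of that shape, 4 floats per step instead of the kernel's 64, assuming 16-byte-aligned x and y with n a multiple of 4 (compile with -maltivec or -mvsx):

#include <altivec.h>

static void saxpy_vec4 (long n, float *x, float *y, float alpha)
{
  __vector float v_a __attribute((aligned(16))) = { alpha, alpha, alpha, alpha };
  __vector float *vptr_x = (__vector float *) x;
  __vector float *vptr_y = (__vector float *) y;
  long i;
  for (i = 0; i < n / 4; i++)
    vptr_y[i] += v_a * vptr_x[i];   /* 4-lane multiply-add per step */
}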
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { BLASLONG i = 0; - __vector float v_a = {alpha,alpha,alpha,alpha}; - __vector float * v_y=(__vector float *)y; - __vector float * v_x=(__vector float *)x; + __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha}; + __vector float * vptr_y =(__vector float *)y; + __vector float * vptr_x =(__vector float *)x; for(; i + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if defined(TRMMKERNEL) +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] = result[6] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[1] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t 
*) &CO[5* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[1] * alpha; +#endif +#define KERNEL(i, j) \ + __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ + __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + N = n >> 3; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + 
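/* Gloss on the TRMM bookkeeping above, paraphrasing the macros rather than
   adding logic: REFRESH_POINTERS(16, 8) repositions AO/BO for the current
   triangular offset, and REFRESH_TEMP_BK sets the effective depth of the
   inner product -- temp = k - off when (LEFT && !TRANSA) or (!LEFT && TRANSA),
   temp = off + 16 (the m-block) in the remaining LEFT case, and
   temp = off + 8 (the n-block) otherwise -- so the kernel accumulates only
   the part of the panel product that lies inside the triangle. */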
v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + BLASLONG K = temp / 64; + for (l = 0; l < K; l++) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + KERNEL (64, 128); + KERNEL (66, 132); + KERNEL (68, 136); + KERNEL (70, 140); + KERNEL (72, 144); + KERNEL (74, 148); + KERNEL (76, 152); + KERNEL (78, 156); + KERNEL (80, 160); + KERNEL (82, 164); + KERNEL (84, 168); + KERNEL (86, 172); + KERNEL (88, 176); + KERNEL (90, 180); + KERNEL (92, 184); + KERNEL (94, 188); + KERNEL (96, 192); + KERNEL (98, 196); + KERNEL (100, 200); + KERNEL (102, 204); + KERNEL (104, 208); + KERNEL (106, 212); + KERNEL (108, 216); + KERNEL (110, 220); + KERNEL (112, 224); + KERNEL (114, 228); + KERNEL (116, 232); + KERNEL (118, 236); + KERNEL (120, 240); + KERNEL (122, 244); + KERNEL (124, 248); + KERNEL (126, 252); + AO += 1024; + BO += 512; + } + if ((temp & 63) >> 5) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + AO += 512; + BO += 256; + } + if ((temp & 31) >> 4) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + AO += 256; + BO += 128; + } + if ((temp & 15) >> 3) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + AO += 128; + BO += 64; + } + if ((temp & 7) >> 2) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + AO += 64; + BO += 32; + } + if ((temp & 3) >> 1) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + AO += 32; + BO += 16; + } + if ((temp & 1) >> 0) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + AO += 16; + BO += 8; + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 
4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 8) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + AO += (temp << 3); + BO += (temp << 3); + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (temp << 2); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (temp << 1); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2], + BO[(l << 3) + 3] + }; + v4sf_t rowB1 = + { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6], + BO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; + CO[4 * ldc] = t1[0]; + CO[5 * ldc] = t1[1]; + CO[6 * ldc] = t1[2]; + CO[7 * ldc] = t1[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; +#endif + CO += 1; + AO += temp; + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if 
defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + B += k << 3; + } + N = (n & 7) >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 
1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2], + BO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2) +#else + BO = B; + temp = k; +#endif + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + 
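/* Only two of the four rows disassembled from each accumulator carry data in
   this n=2 slice (B contributed two values per k-step), which is why the
   SAVE2x4_ACC stores below write back result[0] and result[1] only,
   covering the 16 m-entries of the tile across acc0..acc3. */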
SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (temp << 1); l += 2) + { + v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] }; + v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[0 * ldc + 1] = t[2]; + CO[1 * ldc + 1] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; +#endif + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], 0, 0 }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 
0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2], + AO[(l << 4) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6], + AO[(l << 4) + 7] + }; + v4sf_t rowA2 = + { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10], + AO[(l << 4) + 11] + }; + v4sf_t rowA3 = + { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14], + AO[(l << 4) + 15] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; + CO[8] = t2[0]; + CO[9] = t2[1]; + CO[10] = t2[2]; + CO[11] = t2[3]; + CO[12] = t3[0]; + CO[13] = t3[1]; + CO[14] = t3[2]; + CO[15] = t3[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2], + AO[(l << 3) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6], + AO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2], + AO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], 0, 0 }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + 
REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 81ac031a3..5dfb18f5b 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" +#else #include "common.h" @@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } +#endif diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 3d8a442dc..62c517a9d 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else #include "common.h" @@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } +#endif diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c new file mode 100644 index 000000000..1ae9e04bf --- /dev/null +++ b/kernel/power/shgemm_kernel_power10.c @@ -0,0 +1,1044 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include <altivec.h> +#if defined(HALF) && defined(HALFCONVERSION) +static float +bfloat16tof32 (bfloat16 f16) +{ + float result = 0; + unsigned short *q = (unsigned short *) (&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} + +#define BF16TOF32(x) (bfloat16tof32(x)) +#else +#define BF16TOF32(x) x +#endif + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +vector char mask = + { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, + 0xf +}; + +/* + * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of + * bfloat16 floating-point values as input. Hence this + * merging is needed on A and B matrices. + */ +#define MERGE_ROW(x) vec_perm(x, x, mask) +#define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) +#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) + +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[6] * alpha; + +#define MMA __builtin_mma_xvbf16ger2pp + +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[1] * alpha; + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); +
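Two helpers above deserve a gloss. bfloat16tof32 widens a bfloat16 by placing its 16 bits in the high half of a float (the endian-dependent q[] index), which is the same as shifting the bit pattern left by 16; and MERGE_HIGH/MERGE_LOW interleave the 16-bit lanes of two vectors so that A and B reach xvbf16ger2pp in the 4×2 operand layout the comment describes. A portable scalar model of both (function names here are illustrative):

#include <stdint.h>
#include <string.h>

/* same effect as bfloat16tof32: bf16 is the top half of an IEEE-754 float */
static float bf16_to_f32_model (uint16_t h)
{
  uint32_t bits = (uint32_t) h << 16;
  float f;
  memcpy (&f, &bits, sizeof f);
  return f;
}

/* scalar model of vec_mergeh on two 8x16-bit vectors, in AltiVec element
   order: the result takes lanes a0,b0,a1,b1,a2,b2,a3,b3 (vec_mergel does
   the same with lanes 4..7) */
static void mergeh_model (const uint16_t a[8], const uint16_t b[8],
                          uint16_t r[8])
{
  int i;
  for (i = 0; i < 4; i++) {
    r[2 * i]     = a[i];
    r[2 * i + 1] = b[i];
  }
}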
+#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); +/************************************************************************************* +* SHGEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, + IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + BLASLONG N = n; + BLASLONG i1; + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + N = n >> 3; + /* Loop for n >= 8. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + /* Loop for m >= 16. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[2]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[2]); + vec_t rowA2_h = MERGE_HIGH (rowA[1], rowA[3]); + vec_t rowA2_l = MERGE_LOW (rowA[1], rowA[3]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); + vec_t rowA2_l = MERGE_LOW (rowA[1], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); + CO += 16; + + AO += (k << 4); + BO += (k << 3); + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[1]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[1]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + CO += 8; + AO += (k << 3); + BO += (k << 3); + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowA_mrg = MERGE_ROW (rowA[0]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), rowA_mrg); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), rowA_mrg); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (k << 2); + BO += (k << 3); + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 4]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[(l << 2)]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (k << 1); + BO += (k << 3); + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 } + , t1 = + { + 0, 0, 0, 0}; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 3]), BF16TOF32 (BO[(l << 3) + 1]), + BF16TOF32 (BO[(l << 3) + 2]), + BF16TOF32 (BO[(l << 3) + 3]) + }; + v4sf_t rowB1 = + { BF16TOF32 (BO[(l << 3) + 4]), BF16TOF32 (BO[(l << 3) + 5]), + BF16TOF32 (BO[(l << 3) + 6]), + BF16TOF32 (BO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; + CO += 1; + AO += k; + BO += (k << 3); + } + B += k << 3; + } + N = (n & 7) >> 2; + /* Loop for n >= 4. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; + i = m >> 5; + /* Loop for m >= 32. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + IFLOAT *A1 = AO + (16 * k); + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += k << 4; + BO += k << 2; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 2; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 2; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += k << 1; + BO += k << 2; + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 2]), BF16TOF32 (BO[(l << 2) + 1]), + BF16TOF32 (BO[(l << 2) + 2]), + BF16TOF32 (BO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + AO += k; + BO += (k << 2); + CO += 1; + } + + B += k << 2; + } + N = (n & 3) >> 1; + /* Loop for n >= 2. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 5; + /* Loop for m >= 32. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + IFLOAT *A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowA1 = (vec_t *) & (A1[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += k << 4; + BO += k << 1; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 4]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 1; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 1; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (k << 1); l += 2) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l + 1]), + BF16TOF32 (AO[l + 1]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l + 1]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l + 1]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; + CO += 2; + AO += k << 1; + BO += k << 1; + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), 0, 0 }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 1]), BF16TOF32 (BO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + CO[0 * ldc] += t[0] * alpha; + CO[1 * ldc] += t[1] * alpha; + CO += 1; + AO += k; + BO += k << 1; + } + B += k << 1; + } + N = (n & 1) >> 0; + /* Loop for n = 1. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + /* Loop for m >= 16. */ + while (i >= 16) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 4]), BF16TOF32 (AO[(l << 4) + 1]), + BF16TOF32 (AO[(l << 4) + 2]), + BF16TOF32 (AO[(l << 4) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 4) + 4]), BF16TOF32 (AO[(l << 4) + 5]), + BF16TOF32 (AO[(l << 4) + 6]), + BF16TOF32 (AO[(l << 4) + 7]) + }; + v4sf_t rowA2 = + { BF16TOF32 (AO[(l << 4) + 8]), BF16TOF32 (AO[(l << 4) + 9]), + BF16TOF32 (AO[(l << 4) + 10]), + BF16TOF32 (AO[(l << 4) + 11]) + }; + v4sf_t rowA3 = { BF16TOF32 (AO[(l << 4) + 12]), + BF16TOF32 (AO[(l << 4) + 13]), BF16TOF32 (AO[(l << 4) + 14]), + BF16TOF32 (AO[(l << 4) + 15]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; + AO += k << 4; + BO += k; + CO += 16; + i -= 16; + } + /* Loop for m >= 8. */ + while (i >= 8) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 3]), BF16TOF32 (AO[(l << 3) + 1]), + BF16TOF32 (AO[(l << 3) + 2]), + BF16TOF32 (AO[(l << 3) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 3) + 4]), BF16TOF32 (AO[(l << 3) + 5]), + BF16TOF32 (AO[(l << 3) + 6]), + BF16TOF32 (AO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + AO += k << 3; + BO += k; + CO += 8; + i -= 8; + } + /* Loop for m >= 4. */ + while (i >= 4) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 2]), BF16TOF32 (AO[(l << 2) + 1]), + BF16TOF32 (AO[(l << 2) + 2]), + BF16TOF32 (AO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + AO += k << 2; + BO += k; + CO += 4; + i -= 4; + } + /* Loop for m >= 2. 
*/ + while (i >= 2) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), 0, 0 }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 1]), BF16TOF32 (AO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + AO += k << 1; + BO += k; + CO += 2; + i -= 2; + } + /* Loop for m = 1. */ + while (i >= 1) + { + IFLOAT *BO = B; + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < k; l++) + { + t += BF16TOF32 (AO[l]) * BF16TOF32 (BO[l]); + } + AO += k; + BO += k; + CO[0] += t * alpha; + CO += 1; + i -= 1; + } + + B += k; + } + + return 0; +} diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 6af813c16..a53342f61 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "srot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 4f3ba5698..de37e10a5 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_16) diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 23d13280f..44522f0a0 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index f61c62e75..305e50ede 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index f0f8c6910..3064d5435 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zaxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_4 diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index b21d6ef15..453f4e551 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index fd36c7f44..fe0e9284e 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 @@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) - { + { /* __real__ result = 0.0 ; __imag__ result = 0.0 ; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return(result); } @@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in } #if !defined(CONJ) + /* __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else + /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S new file mode 100644 index 000000000..fca389e69 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs62 +#define alpha_i vs63 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs20, 288(SP) + stxv vs21, 304(SP) + stxv vs22, 320(SP) + stxv vs23, 336(SP) + stxv vs24, 352(SP) + stxv vs25, 368(SP) + stxv vs26, 384(SP) + stxv vs27, 400(SP) + stxv vs28, 416(SP) + stxv vs29, 432(SP) + stxv vs30, 448(SP) + stxv vs31, 464(SP) + + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power10.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power10.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs20, 288(SP) + lxv vs21, 304(SP) + lxv vs22, 320(SP) + lxv vs23, 336(SP) + lxv vs24, 352(SP) + lxv vs25, 368(SP) + lxv vs26, 384(SP) + lxv vs27, 400(SP) + mtlr r0 + lxv vs28, 416(SP) + lxv vs29, 432(SP) + lxv vs30, 448(SP) + lxv vs31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + 
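+/* Epilogue mirrors the prologue: f14-f31, r14-r31 and vs20-vs31 are
+   reloaded from the same stack slots they were saved to, the link
+   register is restored from FLINK_SAVE, and the frame is popped
+   before blr returns. */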
EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_power10.S b/kernel/power/zgemm_logic_power10.S new file mode 100644 index 000000000..1143733e0 --- /dev/null +++ b/kernel/power/zgemm_logic_power10.S @@ -0,0 +1,1735 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 0 + KERNEL2x8_2 32, 0 + KERNEL2x8_2 33, 0 + KERNEL2x8_2 34, 0 + KERNEL2x8_2 35, 0 + KERNEL2x8_2 36, 0 + KERNEL2x8_2 37, 0 + KERNEL2x8_2 38, 0 + KERNEL2x8_2 39, 0 + KERNEL2x8_2 40, 0 + KERNEL2x8_2 41, 0 + KERNEL2x8_2 42, 0 + KERNEL2x8_2 43, 0 + KERNEL2x8_2 44, 0 + KERNEL2x8_2 45, 0 + KERNEL2x8_2 46, 0 + KERNEL2x8_2 47, 0 + KERNEL2x8_2 48, 0 + KERNEL2x8_2 49, 0 + KERNEL2x8_2 50, 0 + KERNEL2x8_2 51, 0 + KERNEL2x8_2 52, 0 + KERNEL2x8_2 53, 0 + KERNEL2x8_2 54, 0 + KERNEL2x8_2 55, 0 + KERNEL2x8_2 56, 0 + KERNEL2x8_2 57, 0 + KERNEL2x8_2 58, 0 + KERNEL2x8_2 59, 0 + KERNEL2x8_2 60, 0 + KERNEL2x8_2 61, 0 + KERNEL2x8_2 62, 0 + KERNEL2x8_2 63, 1 + bdz ZGEMM_L2x8_LOOP_END + b ZGEMM_L2x8_LOOP + MY_ALIGN + +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + KERNEL2x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_2 0, 0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 0 + KERNEL2x4_2 8, 0 + KERNEL2x4_2 9, 0 + KERNEL2x4_2 10, 0 + KERNEL2x4_2 11, 0 + KERNEL2x4_2 12, 0 + KERNEL2x4_2 13, 0 + KERNEL2x4_2 14, 0 + KERNEL2x4_2 15, 1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + KERNEL2x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_2 0, 0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 0 + KERNEL2x2_2 8, 0 + KERNEL2x2_2 9, 0 + KERNEL2x2_2 10, 0 + KERNEL2x2_2 11, 0 + KERNEL2x2_2 12, 0 + KERNEL2x2_2 13, 0 + KERNEL2x2_2 14, 0 + KERNEL2x2_2 15, 1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + KERNEL2x2_2 0, 1 + blr + MY_ALIGN + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 0, 0 +ZGEMM_L2x1_K32: 
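+/* Each KERNEL2x1_L2 consumes two k iterations (32 bytes of A, 64 bytes
+   of B), so the 16 calls in this loop body retire 32 iterations per
+   bdnz trip of the CTR loop primed in ZGEMM_2x1_LMAIN_SUB. */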
+/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_L2 32, 64, 7, 0 + KERNEL2x1_L2 32, 64, 8, 0 + KERNEL2x1_L2 32, 64, 9, 0 + KERNEL2x1_L2 32, 64, 10, 0 + KERNEL2x1_L2 32, 64, 11, 0 + KERNEL2x1_L2 32, 64, 12, 0 + KERNEL2x1_L2 32, 64, 13, 0 + KERNEL2x1_L2 32, 64, 14, 0 + KERNEL2x1_L2 32, 64, 15, 1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + bgt ZGEMM_L2_BEGIN + b ZGEMM_L2_END + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC, 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + bgt ZGEMM_L2_BEGIN_CONTINUE + b ZGEMM_L2x8_END + +ZGEMM_L2_BEGIN_CONTINUE: + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2 + mr T1, T6 +#else + mr T1, K +#endif +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 <- T1 / 128 */ + + KERNEL2x8_PRELOAD + KERNEL2x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + + bgt ZGEMM_L2x8_BEGIN_CONTINUE + b ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_BEGIN_CONTINUE: + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP2x8_128K + LOAD_END_2x8 128, 32 + KERNEL2x8_PRELOAD + addi BO, BO, -64 + addi AO,AO, -256 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + +CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -256 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 1 + MY_ALIGN + +
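+/* Remainder cascade: L holds the leftover k count after the 128-wide
+   main loop; the SUB2_n blocks test bits of L from 64 down to 1 and
+   fall through, e.g. L = 37 runs the 32-, 4- and 1-step blocks.
+   Hypothetical C sketch of the dispatch:
+     if (L & 64) kernel2x8_steps(64);
+     if (L & 32) kernel2x8_steps(32);
+     ...
+     if (L & 1)  load_end_2x8();
+*/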
+ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + LOAD_END_2x8 128, 32 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL2x8_UNPRIME_MMA + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2 +#endif + + ble ZGEMM_L2x8_SAVE_CONTINUE + b ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_SAVE_CONTINUE: + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL2x4_PRELOAD + KERNEL2x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x4_32K + LOAD_END_2x4 64, 32 + KERNEL2x4_PRELOAD + addi BO, BO, -64 + addi AO,AO, -128 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -128 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + LOAD_END_2x4 64, 32 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + KERNEL2x4_UNPRIME_MMA + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL2x2_PRELOAD + KERNEL2x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x2_32K + LOAD_END_2x2 32, 32 + KERNEL2x2_PRELOAD + addi BO, BO, -64 + addi AO,AO, -64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + LOAD_END_2x2 32, 32 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + KERNEL2x2_UNPRIME_MMA + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif
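+/* Main/remainder split used by every size class (hypothetical C sketch
+   of the addi./srawi. pair above and the later "andi. L, T1, 31"):
+     T1 = K - 2;        // two k steps live in the preload + final END
+     T8 = T1 >> 5;      // trips through the 32-step unrolled loop
+     L  = T1 & 31;      // leftover k steps, peeled by the SUB2 blocks
+*/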
+ ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x1_32K + addi BO, BO, -32 + addi AO,AO, -16 + LOAD2x1O 16, 32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -32 + LOAD2x1_2O 32, 64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_E2 32, 64, 7, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_E2 32, 64, 3, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_E2 32, 64, 1, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32, 64, 0, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. 
J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + ble ZGEMM_L2_END + b ZGEMM_L2_BEGIN + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 0 + KERNEL1x8_2 32, 0 + KERNEL1x8_2 33, 0 + KERNEL1x8_2 34, 0 + KERNEL1x8_2 35, 0 + KERNEL1x8_2 36, 0 + KERNEL1x8_2 37, 0 + KERNEL1x8_2 38, 0 + KERNEL1x8_2 39, 0 + KERNEL1x8_2 40, 0 + KERNEL1x8_2 41, 0 + KERNEL1x8_2 42, 0 + KERNEL1x8_2 43, 0 + KERNEL1x8_2 44, 0 + KERNEL1x8_2 45, 0 + KERNEL1x8_2 46, 0 + KERNEL1x8_2 47, 0 + KERNEL1x8_2 48, 0 + KERNEL1x8_2 49, 0 + KERNEL1x8_2 50, 0 + KERNEL1x8_2 51, 0 + KERNEL1x8_2 52, 0 + KERNEL1x8_2 53, 0 + KERNEL1x8_2 54, 0 + KERNEL1x8_2 55, 0 + KERNEL1x8_2 56, 0 + KERNEL1x8_2 57, 0 + KERNEL1x8_2 58, 0 + KERNEL1x8_2 59, 0 + KERNEL1x8_2 60, 0 + KERNEL1x8_2 61, 0 + KERNEL1x8_2 62, 0 + KERNEL1x8_2 63, 1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + KERNEL1x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_2 0, 0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 0 + KERNEL1x4_2 8, 0 + KERNEL1x4_2 9, 0 + KERNEL1x4_2 10, 0 + KERNEL1x4_2 11, 0 + KERNEL1x4_2 12, 0 + KERNEL1x4_2 13, 0 + KERNEL1x4_2 14, 0 + KERNEL1x4_2 15, 1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + KERNEL1x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_2 0, 0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 0 + KERNEL1x2_2 8, 0 + KERNEL1x2_2 9, 0 + KERNEL1x2_2 10, 0 + KERNEL1x2_2 11, 0 + KERNEL1x2_2 12, 0 + KERNEL1x2_2 13, 0 + KERNEL1x2_2 14, 0 + KERNEL1x2_2 15, 1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + KERNEL1x2_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 0, 0 + + 
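+/* M=1 columns accumulate in ordinary VSX registers (xvmaddadp), as in
+   the 2x1 macros, rather than MMA accumulators; each KERNEL1x1_L2 call
+   advances A and B by 32 bytes and retires two k iterations. */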
+ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_L2 32, 32, 7, 0 + KERNEL1x1_L2 32, 32, 8, 0 + KERNEL1x1_L2 32, 32, 9, 0 + KERNEL1x1_L2 32, 32, 10, 0 + KERNEL1x1_L2 32, 32, 11, 0 + KERNEL1x1_L2 32, 32, 12, 0 + KERNEL1x1_L2 32, 32, 13, 0 + KERNEL1x1_L2 32, 32, 14, 0 + KERNEL1x1_L2 32, 32, 15, 1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* (T6-2) / 128 */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* (K-2) / 128 */ +#endif + KERNEL1x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP1x8_128K + LOAD_END_1x8 -128, -16 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -256 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 1 + MY_ALIGN + +
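+/* The dcbt hints in these tails prefetch the A stream at 1024, 1536,
+   2048 and 2560 byte offsets (T2-T5, loaded in ZGEMM_L1x8_BEGIN) and
+   the B stream at T2-T4, keeping both panels ahead of the MMA units. */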
+ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + KERNEL1x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + LOAD_END_1x8 128, 16 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL1x8_UNPRIME_MMA + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL1x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x4_32K + LOAD_END_1x4 -64, -16 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -128 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 1 + MY_ALIGN + +
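+/* Each accumulator primed by xxsetaccz and drained by xxmfacc is a
+   512-bit MMA tile overlaying four VSRs; every xvf64gerpp call folds a
+   rank-1 outer product into it, and the SAVE macros then recombine the
+   real/imaginary halves with alpha before storing to C. */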
+ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + KERNEL1x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + LOAD_END_1x4 64,16 + + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + KERNEL1x4_UNPRIME_MMA + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL1x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x2_32K + LOAD_END_1x2 -32, -16 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -64 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + KERNEL1x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + LOAD_END_1x2 32,16 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + KERNEL1x2_UNPRIME_MMA + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. 
L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x1_32K + addi BO, BO, -16 + addi AO,AO, -16 + LOAD1x1O 16, 16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -32 + LOAD1x1_2O 32, 32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1, L, 16 + ble ZGEMM_L1x1_SUB2_8 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_E2 32, 32, 7, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1, L, 8 + ble ZGEMM_L1x1_SUB2_4 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_E2 32, 32, 3, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_E2 32, 32, 1, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32, 32, 0, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S new file mode 100644 index 000000000..42f9c5ad4 --- /dev/null +++ b/kernel/power/zgemm_macros_power10.S @@ -0,0 +1,1138 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/*from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/*from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha imag instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro 
STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs38,vs40,vs48,vs49 + MULT_APLHA_PART2 vs34,vs36,vs46,vs47 + AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + MULT_APLHA_PART1 vs42,vs44, vs56,vs57 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + MULT_APLHA_PART2 vs42,vs44,vs56,vs57 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59 + UNPACK_FOR_STORE vs56,vs57,vs42,vs44 + UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART1 vs38,vs40, vs48,vs49 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 +#ifndef TRMMKERNEL + lxv vs50, (\LOFFSET)(\BASE_REG) + xxmrgld vs46,vs50,vs50 + xxmrghd vs47,vs50,vs50 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 + 
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + xxmrghd vs39,vs47,vs46 + stxv vs39, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro KERNEL2x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + + +.macro KERNEL2x8_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x8_2 Index, IsLast + lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs44, vs51 + xvf64gerpp 3, vs46, vs51 + xvf64gerpp 4, vs40, vs50 + xvf64gerpp 5, vs42, vs50 + xvf64gerpp 6, vs44, vs50 + xvf64gerpp 7, vs46, vs50 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x8 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 + xxmfacc 4 + xxmfacc 5 + xxmfacc 6 + xxmfacc 7 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + xxpermdi 
vs32, vs16, vs17, 0b01 + xxpermdi vs33, vs16, vs17, 0b10 + xxpermdi vs34, vs18, vs19, 0b01 + xxpermdi vs35, vs18, vs19, 0b10 + xxpermdi vs36, vs20, vs21, 0b01 + xxpermdi vs37, vs20, vs21, 0b10 + xxpermdi vs38, vs22, vs23, 0b01 + xxpermdi vs39, vs22, vs23, 0b10 + xxpermdi vs40, vs24, vs25, 0b01 + xxpermdi vs41, vs24, vs25, 0b10 + xxpermdi vs42, vs26, vs27, 0b01 + xxpermdi vs43, vs26, vs27, 0b10 + xxpermdi vs44, vs28, vs29, 0b01 + xxpermdi vs45, vs28, vs29, 0b10 + xxpermdi vs46, vs30, vs31, 0b01 + xxpermdi vs47, vs30, vs31, 0b10 + + xxlor vs18, vs32, vs32 + xxlor vs19, vs33, vs33 + xxlor vs16, vs34, vs34 + xxlor vs17, vs35, vs35 + xxlor vs22, vs36, vs36 + xxlor vs23, vs37, vs37 + xxlor vs20, vs38, vs38 + xxlor vs21, vs39, vs39 + xxlor vs26, vs40, vs40 + xxlor vs27, vs41, vs41 + xxlor vs24, vs42, vs42 + xxlor vs25, vs43, vs43 + xxlor vs30, vs44, vs44 + xxlor vs31, vs45, vs45 + xxlor vs28, vs46, vs46 + xxlor vs29, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 + addi CO, CO, 128 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + +.macro KERNEL2x4_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL2x4_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x4_2 Index, IsLast + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x4 OffsetA, OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, 
+ +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro KERNEL2x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL2x2_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x2_2 Index, IsLast + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x2 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + SAVE2 vs4,vs5,vs6,vs7,T1,0 + addi CO, CO, 32 +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + lxv vs52, (\OffsetB+32)(BO) // load real,imag from B + lxv vs54, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xxswapd vs55, vs54 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 + xxswapd vs51, vs50 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs2, vs40, vs54 + xvmaddadp vs1, vs40, vs53 + xvmaddadp vs3, vs40, vs55 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs0,vs1,CO,0 + SAVE1 vs2,vs3,T1,0 + addi CO, CO, 16 +.endm
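[Editor's note] The M=1 column path just above drops back to plain VSX: each B value is kept twice, once as loaded (vs48/vs50) and once doubleword-swapped (xxswapd into vs49/vs51), and xvmaddadp accumulates A against both copies. Two FMAs per column produce the four partial products of a complex multiply; the SAVE1/AGGREGATE helpers later combine them into real and imaginary parts with the signs demanded by the conjugation mode. In scalar form (names illustrative):

/* Swap-and-FMA bookkeeping for one k-step of acc += a*b, complex double.
   acc0 tracks a*(b_re,b_im), i.e. vs0; acc1 tracks a*(b_im,b_re), i.e.
   vs1, fed by the xxswapd copy. */
typedef struct { double re, im; } cplx;

static void cfma_step(cplx a, cplx b, double acc0[2], double acc1[2])
{
    acc0[0] += a.re * b.re;  acc0[1] += a.im * b.im;   /* xvmaddadp vs0 */
    acc1[0] += a.re * b.im;  acc1[1] += a.im * b.re;   /* xvmaddadp vs1 */
    /* final combine, done in the SAVE path: re = acc0[0] - acc0[1],
       im = acc1[0] + acc1[1], modulo conjugation sign flips */
}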
+ +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro KERNEL1x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL1x8_2 Index,IsLast + lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A + lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x8 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE1x8 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + addi CO, CO, 128 +.endm
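[Editor's note] KERNEL1x8_2 never advances AO or BO inside the unrolled body; every lxvp addresses through DISPn(\Index, disp), and a single pair of addi instructions under IsLast moves both pointers past the whole strip. The DISPn definitions sit earlier in this file; reading them as Index times n 16-byte complex values plus a literal byte displacement reproduces the offsets used above. This is an assumption from the usage, not a quote of the actual #defines:

/* Presumed C model of the DISPn helpers and the IsLast pointer bump. */
#define DISP16(ind, disp) ((ind) * 16 * 16 + (disp))  /* A side: 8 complex = 128B per half-step */
#define DISP2(ind, disp)  ((ind) * 2 * 16 + (disp))   /* B side: 1 complex per step */

void k_strip(const char **ao, const char **bo, long chunks)
{
    for (long idx = 0; idx < chunks; idx++) {
        const char *a = *ao + DISP16(idx, 0);  /* lxvp vs32, DISP16(Index,0)(AO) */
        const char *b = *bo + DISP2(idx, 0);   /* lxvp vs48, DISP2(Index,0)(BO)  */
        (void)a; (void)b;                      /* ...xvf64gerpp accumulation...  */
    }
    *ao += DISP16(chunks, 0);                  /* addi AO, AO, DISP16(Index,256) */
    *bo += DISP2(chunks, 0);                   /* addi BO, BO, DISP2(Index,32)   */
}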
+ +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro KERNEL1x4_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL1x4_2 Index,IsLast + lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x4 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE1x4 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 + addi CO, CO, 64 +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro KERNEL1x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 +.endm + + +.macro KERNEL1x2_2 Index,IsLast + lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x2 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 +.endm + + +.macro SAVE1x2 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + addi CO, CO, 32 +.endm
+ +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro ZERO1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs49, vs48 + +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs52, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs1, vs40, vs53 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs0,vs1,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH MACROS*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS
PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm + diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index ba019d6a5..1f7199c89 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 @@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif +#endif // #define NBMAX 4096 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index b34199af6..4ed27d96b 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
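[Editor's note] The three REFRESH_* macros above implement the TRMM panel-pointer bookkeeping whose C form is carried along in the // comments; SHIFT_REG converts a value count into a byte offset by shifting left by 4 plus log2(SHIFT_VAL), i.e. multiplying by SHIFT_VAL 16-byte complex values. Collected into one C sketch (it follows the macro comments, not independent logic; c_a/c_b are the per-k element counts, e.g. 16 and 2 in the comments, and BLASLONG comes from common.h):

void refresh_pointers(double **ptrba, double **ptrbb, double *bb,
                      BLASLONG off, int c_a, int c_b)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    *ptrbb = bb;                     /* mr \PTR_B, \B_VAL */
#else
    *ptrba += off * c_a;             /* SHIFT_REG + add on the A side */
    *ptrbb  = bb + off * c_b;        /* SHIFT_REG + add on the B side */
#endif
}

BLASLONG refresh_temp_bk(BLASLONG bk, BLASLONG off, int c_a, int c_b)
{
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    return bk - off;                 /* full remaining depth */
#elif defined(LEFT)
    return off + c_a;                /* number of values in A */
#else
    return off + c_b;                /* number of values in B */
#endif
}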
#include "common.h" #define NBMAX 4096 +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 +#endif #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index c6d666178..5e7ca3b23 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zrot.c" +#else #include "common.h" @@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - \ No newline at end of file +#endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index a1b441d2c..5526f4d67 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,11 +38,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 1d8826f41..3a5a8eb83 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b7cf0f112..582a1dc01 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,6 +53,7 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, +#ifdef BUILD_HALF 0, 0, 0, SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, #ifdef SHGEMM_DEFAULT_UNROLL_MN @@ -109,7 +110,7 @@ gotoblas_t TABLE_NAME = { #else NULL,NULL, #endif - +#endif 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, @@ -134,6 +135,11 @@ gotoblas_t TABLE_NAME = { sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, +#ifdef ARCH_X86_64 + sgemm_directTS, + sgemm_direct_performantTS, +#endif + sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, @@ -706,19 +712,25 @@ gotoblas_t TABLE_NAME = { #if defined(ARCH_ARM64) static void init_parameter(void) { +#if defined(BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; @@ -782,20 +794,26 @@ static 
void init_parameter(void) { #if defined(ARCH_POWER) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -805,20 +823,26 @@ static void init_parameter(void) { #if defined(ARCH_ZARCH) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -958,9 +982,11 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -1145,7 +1171,7 @@ static void init_parameter(void) { #endif #endif -#ifdef SKYLAKEX +#if defined (SKYLAKEX) || defined (COOPERLAKE) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 34653d400..fde9eba8e 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 492f34344..fddf7560f 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 6840c54ad..33afd2a61 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) 
|| defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index e2f731fca..b05bd6ee5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 11825429e..f960559a6 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 4c054f399..cf842c9b5 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e67496736..63c44c27a 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 498057697..4cb01e50a 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index f3072983d..09d5d8e43 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 879ae9c38..7d129e54c 100644 --- 
a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index 6c308197b..d33599317 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE new file mode 100644 index 000000000..0b2f3c0ed --- /dev/null +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.SKYLAKEX diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 65f031d03..9b8b84c30 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,7 +14,7 @@ STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 586d05ac2..c19b98f02 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 93fca0a0d..f2bf19dcd 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" @@ -141,8 +141,8 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i=0; ix=0; iy=0; - inc_x <<= 1; - inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index d81766cd4..0ed02b8d8 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 6bdea6787..c2903b11f 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" @@ -233,9 +233,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda *= 2; lda4 = lda << 2; xbuffer = buffer; diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 72af99809..6d75358a6 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index cde5bdaa6..d84c0c221 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 969357614..e4b6622e6 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 416ace59b..9f2bf24e2 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -54,40 +54,40 @@ #define kernel_kstart_n10(mdim,updk) "" #define kernel_kstart_n12(mdim,updk) "" #define kernel_kend_n4(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) #define kernel_kend_n6(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) #define kernel_kend_n8(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) #define kernel_kend_n10(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\ + loada_kend_k1m##mdim 
acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) #define kernel_kend_n12(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144) #endif #else #define HEAD_SET_OFF(ndim) {} @@ -129,18 +129,28 @@ #define init_update_k(mdim) "" #define save_update_k(mdim) "" #endif - + #define KERNEL_h_k1m16n1 \ "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\ "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;" #define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;" -#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ +#ifdef BROADCAST_KERNEL + #define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;" -#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" -#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; 
vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\ - "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" -#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) + "vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) +#else + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ + "vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\ + "vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) + #define KERNEL_h_k1m16n2 \ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ + unit_acc_m16n2(8,9,10,11,%1) +#endif +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" #define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;" @@ -151,24 +161,42 @@ #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2) #define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #ifdef BROADCAST_KERNEL #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;" - #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) - #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) - #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) - #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) - #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) + #else + #define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;" + #endif + #define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15) + #define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2) #endif #define save_init_m16 "movq %2,%3; addq $128,%2;" #ifdef TRMMKERNEL #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd 
%%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #else #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #endif #define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11) #define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15) @@ -206,11 +234,11 @@ #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;" - #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) - #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15) + #define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) #endif #define save_init_m8 "movq %2,%3; addq $64,%2;" #ifdef TRMMKERNEL @@ -258,11 +286,11 @@ #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" - #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) + #define 
acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m4 "movq %2,%3; addq $32,%2;" #ifdef TRMMKERNEL @@ -311,11 +339,11 @@ #define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" - #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m2 "movq %2,%3; addq $16,%2;" #ifdef TRMMKERNEL @@ -362,11 +390,11 @@ #define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" - #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" - #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" - #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" - #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" - #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" + #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" + #define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" + #define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" + #define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" + #define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" #endif #define save_init_m1 "movq %2,%3; addq $8,%2;" #ifdef TRMMKERNEL diff --git a/kernel/x86_64/dgemm_tcopy_16_skylakex.c b/kernel/x86_64/dgemm_tcopy_16_skylakex.c new file mode 100644 index 000000000..a1da60f8f --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_16_skylakex.c @@ -0,0 +1,129 @@ +#include +#include "common.h" +#include + +int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_dim, double *dst){ + double *src1, *src2, *src3, *src4, *dst1; + __m512d z1,z2,z3,z4,z5,z6,z7,z8; __m256d y1,y2,y3,y4; __m128d x1,x2,x3,x4; double s1,s2,s3,s4; + BLASLONG dim1_count, dim2_count, src_inc; + src_inc = 4 * lead_dim - dim_first; + src1 = src; src2 = src + lead_dim; src3 = src2 + lead_dim; src4 = src3 + lead_dim; + for(dim2_count=dim_second; dim2_count>3; dim2_count-=4){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + z5 = 
_mm512_loadu_pd(src3); z6 = _mm512_loadu_pd(src3+8); src3 += 16; + z7 = _mm512_loadu_pd(src4); z8 = _mm512_loadu_pd(src4+8); src4 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); + _mm512_storeu_pd(dst1+32,z5); _mm512_storeu_pd(dst1+40,z6); + _mm512_storeu_pd(dst1+48,z7); _mm512_storeu_pd(dst1+56,z8); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + z3 = _mm512_loadu_pd(src3); src3 += 8; + z4 = _mm512_loadu_pd(src4); src4 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + y3 = _mm256_loadu_pd(src3); src3 += 4; + y4 = _mm256_loadu_pd(src4); src4 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); + _mm256_storeu_pd(dst1+ 8,y3); _mm256_storeu_pd(dst1+12,y4); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + x3 = _mm_loadu_pd(src3); src3 += 2; + x4 = _mm_loadu_pd(src4); src4 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); + _mm_storeu_pd(dst1+4,x3); _mm_storeu_pd(dst1+6,x4); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; s3 = *src3; src3++; s4 = *src4; src4++; + dst1[0] = s1; dst1[1] = s2; dst1[2] = s3; dst1[3] = s4; + } + src1 += src_inc; src2 += src_inc; src3 += src_inc; src4 += src_inc; + } + src_inc -= 2 * lead_dim; + for(; dim2_count>1; dim2_count-=2){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; + dst1[0] = s1; dst1[1] = s2; + } + src1 += src_inc; src2 += src_inc; + } + src_inc -= lead_dim; + for(; dim2_count>0; dim2_count--){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 16 * dim_second; 
+ } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + _mm512_storeu_pd(dst1+ 0,z1); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + _mm256_storeu_pd(dst1+ 0,y1); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + _mm_storeu_pd(dst1+0,x1); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; + dst1[0] = s1; + } + src1 += src_inc; + } +} diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d33641e9..da68db0cd 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ed672a757..a3bf28dc8 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index e2436f789..d1270d20b 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index a722cc9df..573377ee0 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index bdcd914fb..f0df5aaa8 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -36,7 +36,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
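[Editor's note] The new dgemm_tcopy_16_skylakex.c above (wired in through DGEMMITCOPY in KERNEL.SKYLAKEX) repacks the transposed A panel in strips: full 16-column strips are copied with pairs of 512-bit loads, four source rows at a time, and the 8/4/2/1-wide remainders fall through with correspondingly narrower AVX-512/AVX/SSE/scalar moves. The destination layout it produces, strip by strip with each strip holding every row's w consecutive values, matches this scalar model (my reading of the intrinsics, not code from the patch):

/* rows = dim_second, cols = dim_first, ld = lead_dim in the SIMD code. */
void tcopy16_ref(long rows, long cols, const double *src, long ld, double *dst)
{
    long col = 0;
    for (long w = 16; w >= 1; w /= 2) {
        while (cols - col >= w) {            /* one w-wide strip */
            for (long r = 0; r < rows; r++)
                for (long j = 0; j < w; j++)
                    *dst++ = src[r * ld + col + j];
            col += w;
        }
    }
}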
#if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize off #endif - +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize off +#endif static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { @@ -164,6 +166,9 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL #if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize on #endif +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize on +#endif #else #include "dsymv_L_microk_haswell-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 431e4bb3f..530ac8b1d 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index e1349da58..7b2845636 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 3536afc9e..e816c67e9 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_direct_performant.c b/kernel/x86_64/sgemm_direct_performant.c new file mode 100644 index 000000000..5a20ce395 --- /dev/null +++ b/kernel/x86_64/sgemm_direct_performant.c @@ -0,0 +1,30 @@ +#include "common.h" +/* helper for the direct sgemm code written by Arjan van der Ven */ + + + + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) +{ + unsigned long long mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index 0e8f1318f..a7cddbb3d 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,7 +1,7 @@ - +#if defined(SKYLAKEX) || defined (COOPERLAKE) /* the direct sgemm code written by Arjan van der Ven */ -//#include - +#include +#include "common.h" /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, @@ -38,6 +38,7 @@ #define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; #define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; +#if 0 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { unsigned long long mnk = M * N * K; @@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) return 1; } +#endif - -void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +//void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) { int i, j, k; @@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict } } } +#else +#include "common.h" +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{} +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index 3b1af33c1..f3d614242 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -512,4 +512,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; } #include -#include "sgemm_direct_skylakex.c" +//#include "sgemm_direct_skylakex.c" diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c index 5ab3e6d1f..a2e78c58d 100644 --- 
a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c @@ -1,4 +1,4 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ /* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 63697970f..3eec21774 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 86ecaf516..fe886f57f 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 73ae001ea..c9d698eb7 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index f37c251a1..4d8aac1ab 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c index 4131debb1..5410bd4ae 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.h" #include #include "strsm_kernel_8x4_haswell_L_common.h" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h index cfa56da97..2862a5b8d 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h @@ -1,4 +1,4 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8a5c44c9b..fea4fc746 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index 0c40a3435..b853ef365 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 7a2eeace5..bad367e91 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 0408b577c..147201751 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 
(16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 53866cf95..25e9f6d42 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 01169e8e6..1bc785ac1 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" @@ -140,8 +140,8 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO i=0; ix=0; iy=0; - inc_x <<= 1; - inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { @@ -168,7 +168,7 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO #if defined(SMP) static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, -BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG dummy1, FLOAT dummy2r, FLOAT dummy2i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { zdot_compute(n, x, inc_x, y, inc_y, (void *)result); diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 0fedc496b..1f9d41859 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 2ab7a671b..34f28b224 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_t_microk_haswell-4.c" #endif @@ -235,9 +235,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda <<= 1; lda4 = lda << 2; xbuffer = buffer; diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 2a6d0e4c7..09a702a81 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index e44bd7550..83ed41ba1 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index e9f330c36..7ed2faf0f 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 9f0dead18..5945f3f81 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index b6106a37d..484d74f14 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index f6e3bec23..3510938a7 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -86,24 +86,24 @@ DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = cgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c -STRMMKERNEL = strmm8x4V.S -DTRMMKERNEL = trmm8x4V.S +STRMMKERNEL = gemm_vec.c +DTRMMKERNEL = gemm_vec.c CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S -SGEMMKERNEL = strmm8x4V.S -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = gemm_vec.c +ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) 
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - - -DGEMMKERNEL = gemm8x4V.S +DGEMMKERNEL = gemm_vec.c DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c @@ -145,7 +145,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c new file mode 100644 index 000000000..741c09431 --- /dev/null +++ b/kernel/zarch/gemm_vec.c @@ -0,0 +1,710 @@ +/* + * Copyright (c) IBM Corporation 2020. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "common.h" +#include <vecintrin.h> + +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> + +#ifdef COMPLEX +#error "Handling for complex numbers is not supported in this kernel" +#endif + +#ifdef DOUBLE +#define UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif + +static const size_t unroll_m = UNROLL_M; +static const size_t unroll_n = UNROLL_N; + +/* Handling of triangular matrices */ +#ifdef TRMMKERNEL +static const bool trmm = true; +static const bool left = +#ifdef LEFT + true; +#else + false; +#endif + +static const bool backwards = +#if defined(LEFT) != defined(TRANSA) + true; +#else + false; +#endif + +#else +static const bool trmm = false; +static const bool left = false; +static const bool backwards = false; +#endif /* TRMMKERNEL */ + +/* + * Background: + * + * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication + * problem by splitting all matrices into partitions multiple times, so that the + * submatrices fit into the L1 or L2 caches. As a result, each multiplication of + * submatrices can stream data fast from L1 and L2 caches. In between, it copies + * and rearranges the submatrices to enable contiguous memory accesses to + * improve locality in both caches and TLBs.
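+ * + * As a compact sketch (illustrative only; the actual block sizes come from + * UNROLL_M/UNROLL_N and the cache-size parameters), the resulting loop + * structure amounts to: + * + * for each panel B_j of n_r columns of B: // sized to fit the L1 cache + * for each block A_i of m_r rows of A: // streamed from the L2 cache + * C_ij += alpha * A_i * B_j // m_r x n_r tile held in registers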
+ * + * At the heart of the algorithm is this kernel, which multiplies a "Block + * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is + * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this + * operation GEBP. This kernel further partitions GEBP twice, such that (1) + * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a + * block of C fits into the registers, while multiplying panels from A and B + * streamed from the L2 and L1 cache, respectively (GEBP_block). + * + * + * Algorithm GEBP(A, B, C, m, n, k, alpha): + * + * The problem is calculating C += alpha * (A * B) + * C is an m x n matrix, A is an m x k matrix, B is a k x n matrix. + * + * - C is in column-major-order, with an offset of ldc to the element in the + * next column (same row). + * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column + * contiguously while walking along rows. + * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row + * contiguously. + * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or + * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2 + * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a + * block-of-1). + * + * Note that packing A and B into that form is taken care of by the caller in + * driver/level3/level3.c (actually done by "copy kernels"). + * + * Steps: + * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j. + * Now, B_j should fit into the L1 cache. + * - For each partition, calculate C_j += alpha * (A * B_j) by + * (1) Calculate C_aux := A * B_j (see below) + * (2) unpack C_j = C_j + alpha * C_aux + * + * + * Algorithm for Calculating C_aux: + * + * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows, + * such that the m_r x n_r-submatrix of C_aux can be held in registers. Each + * submatrix of C_aux can be calculated independently, and the registers are + * added back into C_j. + * + * - For each row-block of C_aux: + * (uses a row block of A and full B_j) + * - stream over all columns of A, multiply with elements from B and + * accumulate in registers. (use different inner-kernels to exploit + * vectorization for varying block sizes) + * - add alpha * row block of C_aux back into C_j. + * + * Note that there are additional mechanics for handling triangular matrices, + * calculating B := alpha (A * B) where either of the matrices A or B can be + * triangular. In case of A, the macro "LEFT" is defined. In addition, A can + * optionally be transposed. + * The code effectively skips an "offset" number of columns in A and rows of B + * in each block, to save unnecessary work by exploiting the triangular nature. + * To handle all cases, the code discerns (1) a "left" mode when A is triangular + * and (2) "forward" / "backwards" modes where only the first "offset" + * columns/rows of A/B are used or where the first "offset" columns/rows are + * skipped, respectively. + * + * Reference: + * + * The summary above is based on staring at various kernel implementations and: + * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix + * Multiplication, in ACM Transactions on Mathematical Software, Vol. 34, No. + * 3, May 2008. + */ + +#define VLEN_BYTES 16 +#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) + +typedef FLOAT vector_float __attribute__ ((vector_size (16))); + +/** + * Load a vector into a register, and hint on 8-byte alignment to improve + * performance. gcc-9 and newer will create these hints by themselves. For older + * compiler versions, use inline assembly to explicitly express the hint. + * Provide explicit hex encoding to cater for binutils versions that do not know + * about vector-load with alignment hints yet. + * + * Note that, for block sizes where we apply vectorization, vectors in A will + * always be 8-byte aligned. + */ +static inline vector_float vec_load_hinted(FLOAT const *restrict a) { + vector_float const *restrict addr = (vector_float const *restrict)a; + vector_float y; + +#if __GNUC__ < 9 + // hex-encode vl %[out],%[addr],3 + asm(".insn vrx,0xe70000003006,%[out],%[addr],3" + : [ out ] "=v"(y) + : [ addr ] "R"(*addr)); +#else + y = *addr; +#endif + + return y; +} + +/** + * Calculate a row-block in C_i of size ROWSxCOLS using vector intrinsics. + * + * @param[in] A Pointer to the current block of input matrix A. + * @param[in] bk Number of columns in A. + * @param[in] B Pointer to the current block of input matrix B. + * @param[inout] C Pointer to the current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +#define VECTOR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + _Static_assert( \ + ROWS % VLEN_FLOATS == 0, \ + "rows in block must be multiples of vector length"); \ + vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float A0 = \ + vec_load_hinted(A + i * VLEN_FLOATS); \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] = A0 * B[j]; \ + } \ + \ + /* \ + * Stream over the row-block of A, which is packed \ + * column-by-column, multiply by coefficients in B and add up \ + * into temporaries Caux (which the compiler will hold in \ + * registers). Vectorization: Multiply column vectors from A \ + * with scalars from B and add up in column vectors of Caux. \ + * That equates to unrolling the loop over rows (in i) and \ + * executing each unrolled iteration as a vector element. \ + */ \ + for (BLASLONG k = 1; k < bk; k++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float Ak = vec_load_hinted( \ + A + i * VLEN_FLOATS + k * ROWS); \ + \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += Ak * B[j + k * COLS]; \ + } \ + } \ + \ + /* \ + * Unpack row-block of C_aux into outer C_i, multiply by \ + * alpha and add up. \ + */ \ + for (BLASLONG j = 0; j < COLS; j++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float *C_ij = \ + (vector_float *)(C + i * VLEN_FLOATS + \ + j * ldc); \ + if (trmm) { \ + *C_ij = alpha * Caux[i][j]; \ + } else { \ + *C_ij += alpha * Caux[i][j]; \ + } \ + } \ + } \ + } + + +#if UNROLL_M == 16 +VECTOR_BLOCK(16, 2) +VECTOR_BLOCK(16, 1) +#endif +#if UNROLL_N == 8 +VECTOR_BLOCK(8, 8) +VECTOR_BLOCK(4, 8) +#endif +#ifndef DOUBLE +VECTOR_BLOCK(8, 4) +#endif +VECTOR_BLOCK(8, 2) +VECTOR_BLOCK(8, 1) +VECTOR_BLOCK(4, 4) +VECTOR_BLOCK(4, 2) +VECTOR_BLOCK(4, 1) + +/** + * Calculate a row-block in C_i of size ROWSxCOLS using scalar operations. + * A simple implementation for smaller block sizes. + * + * @param[in] A Pointer to the current block of input matrix A. + * @param[in] k Number of columns in A. + * @param[in] B Pointer to the current block of input matrix B. + * @param[inout] C Pointer to the current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C.
+ * @param[in] alpha Scalar factor. + */ +#define SCALAR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \ + \ + /* \ + * Peel off first iteration (i.e., column of A) for \ + * initializing Caux \ + */ \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \ + \ + for (BLASLONG kk = 1; kk < k; kk++) \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + if (trmm) { \ + C[i + j * ldc] = alpha * Caux[i][j]; \ + } else { \ + C[i + j * ldc] += alpha * Caux[i][j]; \ + } \ + } + +#ifdef DOUBLE +VECTOR_BLOCK(2, 4) +VECTOR_BLOCK(2, 2) +VECTOR_BLOCK(2, 1) +#else +SCALAR_BLOCK(2, 4) +SCALAR_BLOCK(2, 2) +SCALAR_BLOCK(2, 1) +#endif + +SCALAR_BLOCK(1, 4) +SCALAR_BLOCK(1, 2) +SCALAR_BLOCK(1, 1) + + +/** + * Calculate a row-block that fits 4x4 vector registers using a loop + * unrolled-by-2 with explicit interleaving to better overlap loads and + * computation. + * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM. + */ +#ifdef DOUBLE +static inline void GEBP_block_8_4( +#else // float +static inline void GEBP_block_16_4( +#endif + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { +#define VEC_ROWS 4 +#define VEC_COLS 4 +#define ROWS VEC_ROWS * VLEN_FLOATS +#define COLS (VEC_COLS) + + /* + * Hold intermediate results in vector registers. + * Since we need to force the compiler's hand in places, we need to use + * individual variables in contrast to the generic implementation's + * arrays. + */ +#define INIT_ROW_OF_C(ROW) \ + vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \ + vector_float C_##ROW##_0 = A##ROW * B[0]; \ + vector_float C_##ROW##_1 = A##ROW * B[1]; \ + vector_float C_##ROW##_2 = A##ROW * B[2]; \ + vector_float C_##ROW##_3 = A##ROW * B[3]; + + INIT_ROW_OF_C(0) + INIT_ROW_OF_C(1) + INIT_ROW_OF_C(2) + INIT_ROW_OF_C(3) +#undef INIT_ROW_OF_C + + if (bk > 1) { + BLASLONG k = 1; + vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS]; + vector_float Bk[VEC_COLS], Bknext[VEC_COLS]; + + /* + * Note that in several places, we enforce an instruction + * sequence that we identified empirically by utilizing dummy + * asm statements. + */ + + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + k * COLS]); + asm(""); + + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS); + + for (; k < (bk - 2); k += 2) { + /* + * Load inputs for (k+1) into registers. + * Loading from B first is advantageous. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bknext[j] = vec_splats(B[j + (k + 1) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS + + (k + 1) * ROWS); + + /* + * To achieve better instruction-level parallelism, + * make sure to first load input data for (k+1) before + * initiating compute for k. We enforce that ordering + * with a pseudo asm statement. + * Note that we need to massage this particular "barrier" + * depending on the gcc version. 
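+ * + * (Concretely, each such barrier below is an empty asm statement that names + * the C accumulators as read/write operands and the freshly loaded A/B + * vectors as plain inputs; it emits no instruction itself, but keeps gcc + * from sinking those loads past the multiply-adds that consume them.)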
+ */ +#if __GNUC__ > 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm("" \ + : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \ + "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + asm("" \ + : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \ + "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + } while (0) +#else // __GNUC__ <= 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm(""); \ + } while (0) +#endif + + BARRIER_READ_BEFORE_COMPUTE(knext); + + /* Compute for (k) */ + C_0_0 += Ak[0] * Bk[0]; + C_1_0 += Ak[1] * Bk[0]; + C_2_0 += Ak[2] * Bk[0]; + C_3_0 += Ak[3] * Bk[0]; + + C_0_1 += Ak[0] * Bk[1]; + C_1_1 += Ak[1] * Bk[1]; + C_2_1 += Ak[2] * Bk[1]; + C_3_1 += Ak[3] * Bk[1]; + + C_0_2 += Ak[0] * Bk[2]; + C_1_2 += Ak[1] * Bk[2]; + C_2_2 += Ak[2] * Bk[2]; + C_3_2 += Ak[3] * Bk[2]; + + C_0_3 += Ak[0] * Bk[3]; + C_1_3 += Ak[1] * Bk[3]; + C_2_3 += Ak[2] * Bk[3]; + C_3_3 += Ak[3] * Bk[3]; + + asm(""); + + /* + * Load inputs for (k+2) into registers. + * First load from B. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + (k + 2) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS); + + /* + * As above, make sure to first schedule the loads for (k+2) + * before compute for (k+1). + */ + BARRIER_READ_BEFORE_COMPUTE(k); + + /* Compute on (k+1) */ + C_0_0 += Aknext[0] * Bknext[0]; + C_1_0 += Aknext[1] * Bknext[0]; + C_2_0 += Aknext[2] * Bknext[0]; + C_3_0 += Aknext[3] * Bknext[0]; + + C_0_1 += Aknext[0] * Bknext[1]; + C_1_1 += Aknext[1] * Bknext[1]; + C_2_1 += Aknext[2] * Bknext[1]; + C_3_1 += Aknext[3] * Bknext[1]; + + C_0_2 += Aknext[0] * Bknext[2]; + C_1_2 += Aknext[1] * Bknext[2]; + C_2_2 += Aknext[2] * Bknext[2]; + C_3_2 += Aknext[3] * Bknext[2]; + + C_0_3 += Aknext[0] * Bknext[3]; + C_1_3 += Aknext[1] * Bknext[3]; + C_2_3 += Aknext[2] * Bknext[3]; + C_3_3 += Aknext[3] * Bknext[3]; + } + + /* Wrapup remaining k's */ + for (; k < bk; k++) { + vector_float Ak; + +#define COMPUTE_WRAPUP_ROW(ROW) \ + Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \ + C_##ROW##_0 += Ak * B[0 + k * COLS]; \ + C_##ROW##_1 += Ak * B[1 + k * COLS]; \ + C_##ROW##_2 += Ak * B[2 + k * COLS]; \ + C_##ROW##_3 += Ak * B[3 + k * COLS]; + + COMPUTE_WRAPUP_ROW(0) + COMPUTE_WRAPUP_ROW(1) + COMPUTE_WRAPUP_ROW(2) + COMPUTE_WRAPUP_ROW(3) +#undef COMPUTE_WRAPUP_ROW + } + } + + /* + * Unpack row-block of C_aux into outer C_i, multiply by + * alpha and add up (or assign for TRMM). 
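+ * + * (The assignment case serves TRMM, which computes B := alpha * (A * B); + * there the destination tile carries no prior contribution, so it is + * overwritten rather than accumulated into.)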
+ */ +#define WRITE_BACK_C(ROW, COL) \ + do { \ + vector_float *Cij = \ + (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \ + if (trmm) { \ + *Cij = alpha * C_##ROW##_##COL; \ + } else { \ + *Cij += alpha * C_##ROW##_##COL; \ + } \ + } while (0) + + WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3); + WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3); + WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3); + WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3); +#undef WRITE_BACK_C + +#undef ROWS +#undef VEC_ROWS +#undef COLS +#undef VEC_COLS +#undef BARRIER_READ_BEFORE_COMPUTE +} + +/** + * Handle calculation for row blocks in C_i of any size by dispatching into + * macro-defined (inline) functions or by deferring to a simple generic + * implementation. Note that the compiler can remove this awkward-looking + * dispatching code while inlining. + * + * @param[in] m Number of rows in block C_i. + * @param[in] n Number of columns in block C_i. + * @param[in] first_row Index of the first row of the block C_i (relative to C). + * @param[in] A Pointer to input matrix A (note: all of it). + * @param[in] k Number of columns in A and rows in B. + * @param[in] B Pointer to the current column block (panel) of input matrix B. + * @param[inout] C Pointer to the current column block (panel) of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @param[in] off Running offset for handling triangular matrices. + */ +static inline void GEBP_block(BLASLONG m, BLASLONG n, + BLASLONG first_row, + const FLOAT * restrict A, BLASLONG k, + const FLOAT * restrict B, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha, + BLASLONG offset, BLASLONG off) +{ + if (trmm && left) + off = offset + first_row; + + A += first_row * k; + C += first_row; + + if (trmm) { + if (backwards) { + A += off * m; + B += off * n; + k -= off; + } else { + if (left) { + k = off + m; + } else { + k = off + n; + } + } + } + + /* Dispatch into the implementation for each block size: */ + +#define BLOCK(bm, bn) \ + if (m == bm && n == bn) { \ + GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ + return; \ + } + +#if UNROLL_M == 16 + BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); +#endif +#if UNROLL_N == 8 + BLOCK(8, 8); BLOCK(4, 8); +#endif + BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); + BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); + + BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1); + + BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1); + +#undef BLOCK +} + +/** + * Handle a column block (panel) of C and B while calculating C += alpha * (A * B). + * + * @param[in] num_cols Number of columns in the block (in C and B). + * @param[in] first_col First column of the current block (in C and B). + * @param[in] A Pointer to input matrix A. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] B Pointer to input matrix B (note: all of it). + * @param[in] bm Number of rows in C and A. + * @param[inout] C Pointer to output matrix C (note: all of it). + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + */ +static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, + const FLOAT *restrict A, BLASLONG bk, + const FLOAT *restrict B, BLASLONG bm, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha, + BLASLONG const offset) { + + FLOAT *restrict C_i = C + first_col * ldc; + /* + * B is in column-order with n_r packed row elements, which does + * not matter -- we always move in full such blocks of + * column*pack + */ + const FLOAT *restrict B_i = B + first_col * bk; + + BLASLONG off = 0; + if (trmm) { + if (left) { + off = offset; + } else { + off = -offset + first_col; + } + } + + /* + * Calculate C_aux := A * B_i + * then unpack C_i += alpha * C_aux. + * + * For that purpose, further partition C_aux and A into blocks + * of m_r (unroll_m) rows, or powers-of-2 if smaller. + */ + BLASLONG row = 0; + for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) + for (; bm - row >= block_size; row += block_size) + GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, + ldc, alpha, offset, off); +} + +/** + * Inner kernel for matrix-matrix multiplication. C += alpha (A * B) + * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and + * C are pointers to submatrices of the actual matrices. + * + * For triangular matrix multiplication, calculate B := alpha (A * B) where A + * or B can be triangular (in case of A, the macro LEFT will be defined). + * + * @param[in] bm Number of rows in C and A. + * @param[in] bn Number of columns in C and B. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] alpha Scalar factor. + * @param[in] ba Pointer to input matrix A. + * @param[in] bb Pointer to input matrix B. + * @param[inout] C Pointer to output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @returns 0 on success. + */ +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, + FLOAT *restrict ba, FLOAT *restrict bb, + FLOAT *restrict C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) + return 0; + + /* + * The interface code allocates buffers for ba and bb at page + * granularity (i.e., using mmap(MAP_ANONYMOUS)), so enable the compiler + * to make use of that fact in vector load operations. + */ + ba = __builtin_assume_aligned(ba, 16); + bb = __builtin_assume_aligned(bb, 16); + + /* + * Use offset and off even when compiled as SGEMMKERNEL to simplify + * function signatures and function calls. + */ +#ifndef TRMMKERNEL + BLASLONG const offset = 0; +#endif + + /* + * Partition B and C into blocks of n_r (unroll_n) columns, called B_i + * and C_i. For each partition, calculate C_i += alpha * (A * B_i). + * + * For remaining columns that do not fill up a block of n_r, iteratively + * use smaller block sizes of powers of 2.
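+ * + * For example, with unroll_n = 8 and bn = 13, the loop below visits one + * block of 8 columns, then one of 4, then one of 1, covering each column + * of C exactly once.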
+ */ + BLASLONG col = 0; + for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) + for (; bn - col >= block_size; col += block_size) + GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset); + + return 0; +} diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6eb0b696b..012c104bb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -12575,7 +12575,7 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i /* APIs for set/get nancheck flags */ void LAPACKE_set_nancheck( int flag ); -int LAPACKE_get_nancheck( ); +int LAPACKE_get_nancheck( void ); #ifdef __cplusplus } diff --git a/lapack-netlib/SRC/clargv.f b/lapack-netlib/SRC/clargv.f index ba53cae6f..36c5108df 100644 --- a/lapack-netlib/SRC/clargv.f +++ b/lapack-netlib/SRC/clargv.f @@ -200,7 +200,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/clartg.f b/lapack-netlib/SRC/clartg.f index da9a1cdef..baa68b657 100644 --- a/lapack-netlib/SRC/clartg.f +++ b/lapack-netlib/SRC/clartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.SISNAN( ABS( G ) ) ) THEN diff --git a/lapack-netlib/SRC/dlartg.f b/lapack-netlib/SRC/dlartg.f index 1c7c46f63..dc49986a0 100644 --- a/lapack-netlib/SRC/dlartg.f +++ b/lapack-netlib/SRC/dlartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/dlartgp.f b/lapack-netlib/SRC/dlartgp.f index 0cb0d2d13..334e416e8 100644 --- a/lapack-netlib/SRC/dlartgp.f +++ b/lapack-netlib/SRC/dlartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartg.f b/lapack-netlib/SRC/slartg.f index 784d4bc36..307c9c83a 100644 --- a/lapack-netlib/SRC/slartg.f +++ b/lapack-netlib/SRC/slartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartgp.f b/lapack-netlib/SRC/slartgp.f index ad76c94b4..f8be5f52b 100644 --- a/lapack-netlib/SRC/slartgp.f +++ b/lapack-netlib/SRC/slartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 
20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/zheequb.f b/lapack-netlib/SRC/zheequb.f index d698232e8..7d719f41e 100644 --- a/lapack-netlib/SRC/zheequb.f +++ b/lapack-netlib/SRC/zheequb.f @@ -271,7 +271,7 @@ AVG = AVG / N STD = 0.0D0 - DO I = N+1, N + DO I = N+1, 2*N WORK( I ) = S( I-N ) * WORK( I-N ) - AVG END DO CALL ZLASSQ( N, WORK( N+1 ), 1, SCALE, SUMSQ ) diff --git a/lapack-netlib/SRC/zlargv.f b/lapack-netlib/SRC/zlargv.f index 1e17983d5..f83ca1851 100644 --- a/lapack-netlib/SRC/zlargv.f +++ b/lapack-netlib/SRC/zlargv.f @@ -201,7 +201,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/zlartg.f b/lapack-netlib/SRC/zlartg.f index 8989bb896..894b4ded0 100644 --- a/lapack-netlib/SRC/zlartg.f +++ b/lapack-netlib/SRC/zlartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.DISNAN( ABS( G ) ) ) THEN diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index d5ca95013..80e6b3232 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -1,3 +1,7 @@ +enable_language(Fortran) + +enable_testing() + if(MSVC_VERSION) # string(REPLACE "/STACK:10000000" "/STACK:900000000000000000" # CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") @@ -168,3 +172,394 @@ if(PYTHONINTERP_FOUND) COMMAND ${PYTHON_EXECUTABLE} "lapack_testing.py" ) endif() + + + +# $1 exec, $2 input, $3 output_result +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh +"rm -f $3\n" +"$1 < $2 > $3\n" +"grep -q FATAL $3\n" +"if [ $? -eq 0 ]; then\n" +"echo Error\n" +"exit 1\n" +"else\n" +"exit 0\n" +"fi\n" +) + + +add_test(NAME "REAL_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" +) +add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" +) +add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" +) +add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" +) + +add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" +) +# ======== COMPLEX-COMPLEX16 LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" +) + +# ======== SINGLE RFP LIN TESTS ======================== + +add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" +) + +# ======== DOUBLE PRECISION RFP LIN TESTS ======================== + +add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" +) +# ======== COMPLEX RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" +) + +# ======== COMPLEX16 RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" +) +# +# +# ======== SINGLE EIG TESTS =========================== +# + +add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" +) + +add_test(NAME
"SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" +) + +add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" +) + +add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" +) + +add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" +) + +add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" +) + +add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" +) + +add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" +) + +add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" +) + +add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" +) + +add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" +) + +add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" +) + +add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" +) + +add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " 
${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" +) + +add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" +) + +add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" +) + +add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" +) + +add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" +) + +add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" +) + +add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" +) + +# ======== COMPLEX EIG TESTS =========================== + +add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" +) + +add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" +) + +add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" +) + +add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" +) + +add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" +) + +add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" +) + +add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" +) + +add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" +) + +add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" +) + +add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" +) + +add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" +) + +add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" +) + +add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" +) + +add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" +) + +add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" +) + +add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" +) + +add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" +) + +add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" +) + +add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" +) + +add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" +) + +# ======== DOUBLE EIG TESTS =========================== + +add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" +) + +add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" +) + +add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" +) + +add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" +) + +add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" +) + +add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" +) + +add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" +) + +add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" +) + +add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" +) + +add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" +) + +add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" +) + +add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" +) + +add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" +) + +add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" +) + +add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" +) + +add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" +) + +add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" +) + +add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" +) + +add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" +) + +add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" +) + +# ======== COMPLEX16 EIG TESTS =========================== + +add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" +) + +add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" +) + +add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" +) + +add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" +) + +add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" +) + +add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" +) + +add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" +) + +add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" +) + +add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" +) + +add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" +) + +add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" +) + +add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" +) + +add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" +) + +add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" +) + +add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" +) + +add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" +) + +add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " 
${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" +) + +add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" +) + +add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" +) + +add_test(NAME "Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" +) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 20fd25b4a..e877b1422 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,7 @@ set(ZEIGTST zchkee.f macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) endmacro() if(BUILD_SINGLE) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index c941d3577..0d0bb5418 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,8 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) +#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index d24e3885b..f43c10b72 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -372,13 +372,13 @@ END IF * Compute workspace needed for CGELSY CALL CGELSY( M, N, NRHS, A, LDA, B, LDB, - $ IWQ, RCOND, CRANK, WQ, -1, RWORK, + $ IWQ, RCOND, CRANK, WQ, -1, RWQ, $ INFO ) LWORK_CGELSY = INT( WQ( 1 ) ) LRWORK_CGELSY = 2*N * Compute workspace needed for CGELSS CALL CGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_CGELSS = INT( WQ( 1 ) ) LRWORK_CGELSS = 5*MNMIN * Compute workspace needed for CGELSD @@ -564,7 +564,7 @@ CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL CSCAL( NCOLS*NRHS, - $ ONE / REAL( NCOLS ), WORK, + $ CONE / REAL( NCOLS ), WORK, $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, diff --git a/lapack-netlib/TESTING/LIN/derrtsqr.f b/lapack-netlib/TESTING/LIN/derrtsqr.f index c8ad30257..d1d0ff02d 100644 --- a/lapack-netlib/TESTING/LIN/derrtsqr.f +++ b/lapack-netlib/TESTING/LIN/derrtsqr.f @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. DOUBLE PRECISION A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. External Subroutines .. 
EXTERNAL ALAESM, CHKXER, DGEQR, @@ -137,6 +137,8 @@ * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'DGEMQR' NB=1 INFOT = 1 diff --git a/lapack-netlib/TESTING/LIN/serrtsqr.f b/lapack-netlib/TESTING/LIN/serrtsqr.f index f00f3e14b..7f91a3c39 100644 --- a/lapack-netlib/TESTING/LIN/serrtsqr.f +++ b/lapack-netlib/TESTING/LIN/serrtsqr.f @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. REAL A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, SGEQR, @@ -137,6 +137,8 @@ * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'SGEMQR' NB=1 INFOT = 1 diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 4587c5686..1313c853b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -372,12 +372,12 @@ END IF * Compute workspace needed for ZGELSY CALL ZGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_ZGELSY = INT( WQ( 1 ) ) LRWORK_ZGELSY = 2*N * Compute workspace needed for ZGELSS CALL ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1 , RWORK, + $ RCOND, CRANK, WQ, -1 , RWQ, $ INFO ) LWORK_ZGELSS = INT( WQ( 1 ) ) LRWORK_ZGELSS = 5*MNMIN @@ -564,7 +564,7 @@ CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL ZSCAL( NCOLS*NRHS, - $ ONE / DBLE( NCOLS ), WORK, + $ CONE / DBLE( NCOLS ), WORK, $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index c602822a8..fc410b0e7 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -68,7 +68,7 @@ double sqrt(double); #define GETRF_FACTOR 1.00 -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c deleted file mode 100644 index 312509685..000000000 --- a/lapack/getrf/potrf_parallel.c +++ /dev/null @@ -1,667 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include <stdio.h> -#include "common.h" - -#ifndef USE_SIMPLE_THREADED_LEVEL3 - -//The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. -#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD -#define USE_ALLOC_HEAP -#endif - - -static FLOAT dm1 = -1.; - -#ifndef KERNEL_FUNC -#ifndef LOWER -#define KERNEL_FUNC SYRK_KERNEL_U -#else -#define KERNEL_FUNC SYRK_KERNEL_L -#endif -#endif - -#ifndef LOWER -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_LT -#else -#define TRSM_KERNEL TRSM_KERNEL_LC -#endif -#else -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_RN -#else -#define TRSM_KERNEL TRSM_KERNEL_RR -#endif -#endif - -#ifndef CACHE_LINE_SIZE -#define CACHE_LINE_SIZE 8 -#endif - -#ifndef DIVIDE_RATE -#define DIVIDE_RATE 2 -#endif - -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - -#ifndef LOWER -#define TRANS -#endif - -#ifndef SYRK_LOCAL -#if !defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_UN -#elif !defined(LOWER) && defined(TRANS) -#define SYRK_LOCAL SYRK_UT -#elif defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_LN -#else -#define SYRK_LOCAL SYRK_LT -#endif -#endif - -typedef struct { -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; -} job_t; - - -#ifndef KERNEL_OPERATION -#ifndef COMPLEX -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#else -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#endif -#endif - -#ifndef ICOPY_OPERATION -#ifndef TRANS -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef OCOPY_OPERATION -#ifdef TRANS -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef S -#define S args -> a -#endif -#ifndef A -#define A args -> b -#endif -#ifndef C -#define C args -> c -#endif -#ifndef LDA -#define LDA args -> lda -#endif -#ifndef N -#define N args -> m -#endif -#ifndef K -#define K args -> k -#endif - -static
int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ - - FLOAT *buffer[DIVIDE_RATE]; - - BLASLONG k, lda; - BLASLONG m_from, m_to; - - FLOAT *alpha; - FLOAT *a, *c; - job_t *job = (job_t *)args -> common; - BLASLONG xxx, bufferside; - - BLASLONG jjs, min_jj; - BLASLONG is, min_i, div_n; - - BLASLONG i, current; - - k = K; - - a = (FLOAT *)A; - c = (FLOAT *)C; - - lda = LDA; - - alpha = (FLOAT *)args -> alpha; - - m_from = range_n[mypos + 0]; - m_to = range_n[mypos + 1]; - -#if 0 - fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); -#endif - - div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); - for (i = 1; i < DIVIDE_RATE; i++) { - buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; - } - -#ifndef LOWER - TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#else - TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#endif - - for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { - - for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ - - min_jj = MIN(m_to, xxx + div_n) - jjs; - -#ifndef LOWER - if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; -#else - if (min_jj > GEMM_P) min_jj = GEMM_P; -#endif - -#ifndef LOWER - OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (k, min_jj, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - sb, - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - a + jjs * lda * COMPSIZE, lda, 0); -#else - ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (min_jj, k, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - sb, - a + jjs * COMPSIZE, lda, 0); -#endif - } - -#ifndef LOWER - for (i = 0; i <= mypos; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#else - for (i = mypos; i < args -> nthreads; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#endif - - WMB; - } - - min_i = m_to - m_from; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - /* thread has to wait */ - if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, m_from, xxx); - - if (m_from + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } - -#ifndef LOWER - current ++; -#else - current --; -#endif - } - - for(is = m_from + min_i; is < m_to; is += min_i){ - min_i = m_to - is; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; 
- } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, is, xxx); - - if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } -#ifndef LOWER - current ++; -#else - current --; -#endif - } - } - - for (i = 0; i < args -> nthreads; i++) { - if (i != mypos) { - for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; - } - } - } - - return 0; - } - -static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ - - blas_arg_t newarg; - -#ifndef USE_ALLOC_HEAP - job_t job[MAX_CPU_NUMBER]; -#else - job_t * job = NULL; -#endif - - blas_queue_t queue[MAX_CPU_NUMBER]; - - BLASLONG range[MAX_CPU_NUMBER + 100]; - - BLASLONG num_cpu; - - BLASLONG nthreads = args -> nthreads; - - BLASLONG width, i, j, k; - BLASLONG n, n_from, n_to; - int mode, mask; - double dnum; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; - mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; - mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; -#elif defined(HALF) - mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_REAL; - mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; - mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; - mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; - mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; -#endif -#endif - - newarg.m = args -> m; - newarg.k = args -> k; - newarg.a = args -> a; - newarg.b = args -> b; - newarg.c = args -> c; - newarg.lda = args -> lda; - newarg.alpha = args -> alpha; - -#ifdef USE_ALLOC_HEAP - job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); - if(job==NULL){ - fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); - exit(1); - } -#endif - - newarg.common = (void *)job; - - n_from = 0; - n_to = args -> m; - -#ifndef LOWER - - range[MAX_CPU_NUMBER] = n_to - n_from; - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - - queue[num_cpu].sa = NULL; - 
queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - - for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; - -#else - - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[num_cpu + 1] = range[num_cpu] + width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - queue[num_cpu].range_n = range; - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - -#endif - - newarg.nthreads = num_cpu; - - if (num_cpu) { - - for (j = 0; j < num_cpu; j++) { - for (i = 0; i < num_cpu; i++) { - for (k = 0; k < DIVIDE_RATE; k++) { - job[j].working[i][CACHE_LINE_SIZE * k] = 0; - } - } - } - - queue[0].sa = sa; - queue[0].sb = sb; - queue[num_cpu - 1].next = NULL; - - exec_blas(num_cpu, queue); - } - -#ifdef USE_ALLOC_HEAP - free(job); -#endif - - return 0; -} - -#endif - -blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { - - BLASLONG n, bk, i, blocking, lda; - BLASLONG info; - int mode; - blas_arg_t newarg; - FLOAT *a; - FLOAT alpha[2] = { -ONE, ZERO}; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; -#else - mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif - - if (args -> nthreads == 1) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); -#endif - return info; - } - - n = args -> n; - a = (FLOAT *)args -> a; - lda = args -> lda; - - if (range_n) n = range_n[1] - range_n[0]; - - if (n <= GEMM_UNROLL_N * 2) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); -#endif - return info; - } - - newarg.lda = lda; - newarg.ldb = lda; - newarg.ldc = lda; - newarg.alpha = alpha; - newarg.beta = NULL; - newarg.nthreads = args -> nthreads; - - blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; - if (blocking > GEMM_Q) blocking = GEMM_Q; - - for (i = 0; i < n; i += blocking) { - bk = n - i; - if (bk > blocking) bk = blocking; - - newarg.m = bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - - info = CNAME(&newarg, NULL, NULL, sa, sb, 0); - if (info) return info + i; - - if (n - i - bk > 0) { -#ifndef USE_SIMPLE_THREADED_LEVEL3 - newarg.m = n - i - bk; - newarg.k = bk; -#ifndef LOWER - newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; -#else - newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; -#endif - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - - thread_driver(&newarg, sa, sb); -#else - -#ifndef LOWER - newarg.m = bk; - newarg.n = n - i - bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; - - gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); - 
- newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); -#endif -#else - newarg.m = n - i - bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + bk + i * lda) * COMPSIZE; - - gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + (i + bk + i * lda) * COMPSIZE; - newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); -#endif -#endif - -#endif - } - } - return 0; -} diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index e61e8decb..008fcb8cc 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -101,7 +101,12 @@ static FLOAT dm1 = -1.; #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#ifdef HAVE_C11 + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; @@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#elif defined(HALF) + mode = BLAS_HALF | BLAS_REAL; + mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; diff --git a/openblas_config_template.h b/openblas_config_template.h index 49aea1cab..9955e5c73 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -34,6 +34,10 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#endif + #ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; #else diff --git a/param.h b/param.h index 9fdf40fe2..1ab982dc5 100644 --- a/param.h +++ b/param.h @@ -1748,6 +1748,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef COOPERLAKE + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif #ifdef ATOM @@ -1974,7 +2092,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -2225,7 +2343,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL - +#if defined(__32BIT__) +#warning using BINARY32==POWER6 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2234,7 +2362,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 - +#endif #define SGEMM_DEFAULT_P 1280UL #define DGEMM_DEFAULT_P 640UL #define CGEMM_DEFAULT_P 640UL @@ -2254,13 +2382,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 8 #endif -#if defined(POWER9) +#if defined(POWER9) || defined(POWER10) #define SNUMOPT 16 #define DNUMOPT 8 @@ -2288,10 +2416,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #define SYMV_P 8 #endif +#if defined(POWER10) +#undef SHGEMM_DEFAULT_UNROLL_N +#undef SHGEMM_DEFAULT_UNROLL_M +#undef SHGEMM_DEFAULT_P +#undef SHGEMM_DEFAULT_R +#undef SHGEMM_DEFAULT_Q +#define SHGEMM_DEFAULT_UNROLL_M 16 +#define SHGEMM_DEFAULT_UNROLL_N 8 +#define SHGEMM_DEFAULT_P 832 +#define SHGEMM_DEFAULT_Q 1026 +#define SHGEMM_DEFAULT_R 4096 +#endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4 @@ -2618,7 +2764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 -#if defined(CORTEXA53) || defined(CORTEXA57) || \ +#if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ defined(FALKOR) || defined(TSV110) || defined(EMAG8180) @@ -2664,6 +2810,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 +#elif defined(CORTEXA53) + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 + #elif defined(THUNDERX) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2722,6 +2897,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(THUNDERX3T110) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(NEOVERSEN1) #define SGEMM_DEFAULT_UNROLL_M 16 @@ -2994,7 +3198,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 @@ -3006,12 +3210,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 456 +#define SGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 320 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 224 -#define SGEMM_DEFAULT_Q 488 +#define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 352 diff --git a/test/Makefile b/test/Makefile index 7a873b7e5..45f9821ec 100644 --- a/test/Makefile +++ b/test/Makefile @@ -64,9 +64,17 @@ endif endif endif +ifeq ($(BUILD_HALF),1) +level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +else level3 : sblat3 dblat3 cblat3 zblat3 +endif ifndef CROSS rm -f ?BLAT3.SUMM +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat @@ -78,6 +86,10 @@ ifndef CROSS ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_HALF),1) + OMP_NUM_THREADS=2 ./test_shgemm > 
SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -87,6 +99,10 @@ ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 else +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -165,6 +181,11 @@ zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_HALF),1) +test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) @@ -187,7 +208,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - sblat3 dblat3 cblat3 zblat3 \ + test_shgemm sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ diff --git a/test/cblat1.f b/test/cblat1.f index d6b53d105..ecf2a44cb 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -1,7 +1,49 @@ +*> \brief \b CBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 1 BLAS. +*> Based upon the original BLAS test routine together with: +*> +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT1 -* Test program for the COMPLEX Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. 
INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), - + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.5E0,0.0E0), + + (0.0E0,0.5E0), (0.0E0,0.2E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -129,10 +171,10 @@ + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), - + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), - + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ - DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ - DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + + (0.5E0,0.0E0), (6.0E0,9.0E0), (0.0E0,0.5E0), + + (8.0E0,3.0E0), (0.0E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.8E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.6E0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -145,8 +187,8 @@ + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.19E0,-0.17E0), (0.32E0,0.09E0), - + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (0.19E0,-0.17E0), (0.20E0,-0.35E0), + + (0.35E0,0.20E0), (0.14E0,0.08E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + (2.0E0,3.0E0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), @@ -162,9 +204,9 @@ + (-0.17E0,0.46E0), (4.0E0,7.0E0), + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), - + (0.32E0,0.09E0), (6.0E0,9.0E0), - + (0.23E0,-0.24E0), (8.0E0,3.0E0), - + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + + (0.20E0,-0.35E0), (6.0E0,9.0E0), + + (0.35E0,0.20E0), (8.0E0,3.0E0), + + (0.14E0,0.08E0), (9.0E0,4.0E0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -177,8 +219,8 @@ + (0.03E0,0.03E0), (-0.18E0,0.03E0), + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.09E0,0.03E0), (0.03E0,0.12E0), - + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (0.09E0,0.03E0), (0.15E0,0.00E0), + + (0.00E0,0.15E0), (0.00E0,0.06E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -193,8 +235,8 @@ + (-0.18E0,0.03E0), (4.0E0,7.0E0), + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), - + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), - + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + + (0.15E0,0.00E0), (6.0E0,9.0E0), (0.00E0,0.15E0), + + (8.0E0,3.0E0), (0.00E0,0.06E0), (9.0E0,4.0E0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + REAL ZERO + PARAMETER (NOUT=6, ZERO=0.0E0) * .. Scalar Arguments .. REAL SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). 
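The STEST hunk above replaces the old SDIFF-based acceptance test with an explicit machine-epsilon bound: a computed value now passes when ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO). The following is a minimal C sketch of the same criterion (illustrative only, not part of this patch; the helper name is invented); note that Fortran's EPSILON for a default REAL corresponds to C's FLT_EPSILON:

    #include <float.h>
    #include <math.h>

    /* Mirrors the new test in cblat1.f's STEST: accept `computed` when the
       scaled difference is within one machine epsilon of the reference
       magnitude `ssize`. */
    static int stest_accepts(float computed, float expected,
                             float sfac, float ssize)
    {
        float sd = computed - expected;        /* SD = SCOMP(I) - STRUE(I) */
        return fabsf(sfac * sd) <= fabsf(ssize) * FLT_EPSILON;
    }

Unlike the old formulation, which passed only if ABS(SSIZE(I))+ABS(SFAC*SD) rounded back to ABS(SSIZE(I)), this bound does not depend on how an intermediate sum is rounded, so it behaves the same with or without extended-precision arithmetic.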
diff --git a/test/cblat2.f b/test/cblat2.f index 20f188100..8c7bac48e 100644 --- a/test/cblat2.f +++ b/test/cblat2.f @@ -1,68 +1,114 @@ +*> \brief \b CBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'cblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT2 * -* Test program for the COMPLEX Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* CGERC T PUT F FOR NO TEST. SAME COLUMNS. -* CGERU T PUT F FOR NO TEST. SAME COLUMNS. -* CHER T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. 
INTEGER NIN @@ -71,8 +117,8 @@ PARAMETER ( NSUBS = 17 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -126,7 +172,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -135,7 +181,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -240,14 +286,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMVCH using exact data. @@ -3079,7 +3118,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/cblat3.f b/test/cblat3.f index 5df1ddd64..a65e1364c 100644 --- a/test/cblat3.f +++ b/test/cblat3.f @@ -1,50 +1,96 @@ +*> \brief \b CBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'cblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. 
+*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT3 * -* Test program for the COMPLEX Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* CHERK T PUT F FOR NO TEST. SAME COLUMNS. -* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -53,8 +99,8 @@ PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -103,7 +149,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -112,7 +158,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. 
READ( NIN, FMT = * )REWI @@ -189,14 +235,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. @@ -1301,8 +1340,6 @@ NC = 0 RESET = .TRUE. ERRMAX = RZERO - RALS = RONE - RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) @@ -1948,7 +1985,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1958,12 +1995,19 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to CSYMM and CHEMM +* with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA, RBETA @@ -1981,6 +2025,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA. +* + ALPHA = CMPLX( ONE, -ONE ) + BETA = CMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2207,16 +2259,16 @@ CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2274,16 +2326,16 @@ CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -3270,7 +3322,6 @@ 50 CONTINUE END 
IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index 978972b24..57aee7b8f 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include <stdio.h> #include <stdlib.h> -#include "common.h" +#include "../common.h" #define SGEMM BLASFUNC(sgemm) #define SHGEMM BLASFUNC(shgemm) typedef union { @@ -46,50 +46,86 @@ typedef union } bits; } bfloat16_bits; +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_bits; + +float +float16to32 (bfloat16_bits f16) +{ + float32_bits f32; + f32.bits.s = f16.bits.s; + f32.bits.e = f16.bits.e; + f32.bits.m = (uint32_t) f16.bits.m << 16; + return f32.v; +} + int main (int argc, char *argv[]) { int m, n, k; int i, j, l; + int x; int ret = 0; - int loop = 20; + int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - char transa = 'N'; - char transb = 'N'; - for (int x = 0; x <= loop; x++) + for (x = 0; x <= loop; x++) { m = k = n = x; float A[m * k]; float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float CC[m * n]; + float DD[m * n], CC[m * n]; - for (int j = 0; j < m; j++) { - for (int i = 0; i < m; i++) + for (j = 0; j < m; j++) { + for (i = 0; i < m; i++) { - A[j * k + i] = j * 9.0; - B[j * k + i] = i * 2.0; + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; + DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); - + &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) for (l = 0; l < k; l++) - if (CC[i * m + j] != C[i * m + j]) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) ret++; + if (transA == 'N' && transB == 'N') + { + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (CC[i * m + j] != DD[i * m + j]) + ret++; + } } - fprintf (stderr, "Return code: %d\n", ret); + if (ret != 0) + fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); return ret; } diff --git a/test/dblat2.f b/test/dblat2.f index 4002d4368..9bbbe9792 100644 --- a/test/dblat2.f +++ b/test/dblat2.f @@ -1,75 +1,121 @@ +*> \brief \b DBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ).
An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'dblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGER T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT2 * -* Test program for the DOUBLE PRECISION Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT.
0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* DGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* DGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* DGER T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. 
GO TO 80 70 CONTINUE diff --git a/test/dblat3.f b/test/dblat3.f index 082e03e5e..1ebec4ffa 100644 --- a/test/dblat3.f +++ b/test/dblat3.f @@ -1,55 +1,101 @@ +*> \brief \b DBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'dblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT3 * -* Test program for the DOUBLE PRECISION Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. 
-* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* DGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + DOUBLE PRECISION ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. 
+* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/sblat2.f b/test/sblat2.f index a1074be52..56ead8640 100644 --- a/test/sblat2.f +++ b/test/sblat2.f @@ -1,75 +1,121 @@ +*> \brief \b SBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'sblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGER T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 
41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT2 * -* Test program for the REAL Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* SGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* SGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRMV T PUT F FOR NO TEST. SAME COLUMNS. -* STBMV T PUT F FOR NO TEST. SAME COLUMNS. -* STPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRSV T PUT F FOR NO TEST. SAME COLUMNS. -* STBSV T PUT F FOR NO TEST. SAME COLUMNS. -* STPSV T PUT F FOR NO TEST. SAME COLUMNS. -* SGER T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
-* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/sblat3.f b/test/sblat3.f index 325a9eb92..66edac14e 100644 --- a/test/sblat3.f +++ b/test/sblat3.f @@ -1,55 +1,101 @@ +*> \brief \b SBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'sblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. 
+*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT3 * -* Test program for the REAL Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* SGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRSM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. 
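Every tester in this patch replaces the classic machine-precision loop (halve EPS until SDIFF( ONE + EPS, ONE ), or its DDIFF counterpart, reports zero, then double back) with the Fortran 90 EPSILON intrinsic, as the EPS hunks above and below show. A minimal C sketch, illustrative only and not part of the patch, of why the two agree; FLT_EPSILON plays the role of EPSILON(ZERO), and the volatile store stands in for the SDIFF round trip that kept the old loop honest:

```c
#include <stdio.h>
#include <float.h>

/* Legacy tester approach: halve EPS until 1 + EPS rounds to 1, then
 * double back.  The volatile store forces each sum through a 32-bit
 * float, the same job SDIFF did in the Fortran loop; without it an
 * extended-precision register can carry the loop past the true
 * epsilon, which is the fragility the EPSILON intrinsic avoids. */
static float eps_by_halving(void)
{
    float eps = 1.0f;
    volatile float sum = 1.0f + eps;
    while (sum != 1.0f) {
        eps *= 0.5f;
        sum = 1.0f + eps;
    }
    return eps + eps;           /* double back, as the testers did */
}

int main(void)
{
    printf("loop eps    = %g\n", (double)eps_by_halving());
    printf("FLT_EPSILON = %g\n", (double)FLT_EPSILON); /* EPSILON(0.0) */
    return 0;
}
```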
@@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. REAL ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. +* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/zblat1.f b/test/zblat1.f index 8b4b8d21e..2d7b88490 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -1,7 +1,49 @@ +*> \brief \b ZBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 1 BLAS. +*> +*> Based upon the original BLAS test routine together with: +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT1 -* Test program for the COMPLEX*16 Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), - + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.5D0,0.0D0), + + (0.0D0,0.5D0), (0.0D0,0.2D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -129,10 +171,10 @@ + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), - + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), - + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ - DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ - DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + + (0.5D0,0.0D0), (6.0D0,9.0D0), (0.0D0,0.5D0), + + (8.0D0,3.0D0), (0.0D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.8D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.6D0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -145,8 +187,8 @@ + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.19D0,-0.17D0), (0.32D0,0.09D0), - + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (0.19D0,-0.17D0), (0.20D0,-0.35D0), + + (0.35D0,0.20D0), (0.14D0,0.08D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + (2.0D0,3.0D0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), @@ -162,9 +204,9 @@ + (-0.17D0,0.46D0), (4.0D0,7.0D0), + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), - + (0.32D0,0.09D0), (6.0D0,9.0D0), - + (0.23D0,-0.24D0), (8.0D0,3.0D0), - + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + + (0.20D0,-0.35D0), (6.0D0,9.0D0), + + (0.35D0,0.20D0), (8.0D0,3.0D0), + + (0.14D0,0.08D0), (9.0D0,4.0D0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -177,8 +219,8 @@ + (0.03D0,0.03D0), (-0.18D0,0.03D0), + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.09D0,0.03D0), (0.03D0,0.12D0), - + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (0.09D0,0.03D0), (0.15D0,0.00D0), + + (0.00D0,0.15D0), (0.00D0,0.06D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -193,8 +235,8 @@ + (-0.18D0,0.03D0), (4.0D0,7.0D0), + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), - + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), - + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + + (0.15D0,0.00D0), 
(6.0D0,9.0D0), (0.00D0,0.15D0), + + (8.0D0,3.0D0), (0.00D0,0.06D0), (9.0D0,4.0D0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + DOUBLE PRECISION ZERO + PARAMETER (NOUT=6, ZERO=0.0D0) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). diff --git a/test/zblat2.f b/test/zblat2.f index e65cdcc70..4a20ac567 100644 --- a/test/zblat2.f +++ b/test/zblat2.f @@ -1,68 +1,114 @@ +*> \brief \b ZBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'zblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. 
+*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT2 * -* Test program for the COMPLEX*16 Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERC T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERU T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. 
INTEGER NIN @@ -72,8 +118,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -127,7 +173,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -136,7 +182,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -241,14 +287,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMVCH using exact data. @@ -3087,7 +3126,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/zblat3.f b/test/zblat3.f index f03b1a617..0e38334e9 100644 --- a/test/zblat3.f +++ b/test/zblat3.f @@ -1,50 +1,97 @@ +*> \brief \b ZBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'zblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. 
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*> 10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>           can be run multiple times without deleting generated
+*>           output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
       PROGRAM ZBLAT3
 *
-*  Test program for the COMPLEX*16 Level 3 Blas.
+*  -- Reference BLAS test routine (version 3.7.0) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*  The program must be driven by a short data file. The first 14 records
-*  of the file are read using list-directed input, the last 9 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 23 lines:
-*  'ZBLAT3.SUMM'            NAME OF SUMMARY OUTPUT FILE
-*  6                        UNIT NUMBER OF SUMMARY FILE
-*  'ZBLAT3.SNAP'            NAME OF SNAPSHOT OUTPUT FILE
-*  -1                       UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  3                 NUMBER OF VALUES OF ALPHA
-*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
-*  ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHERK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*  See:
-*
-*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
-*     A Set of Level 3 Basic Linear Algebra Subprograms.
-*
-*     Technical Memorandum No.88 (Revision 1), Mathematics and
-*     Computer Science Division, Argonne National Laboratory, 9700
-*     South Cass Avenue, Argonne, Illinois 60439, US.
-*
-*  -- Written on 8-February-1989.
-*     Jack Dongarra, Argonne National Laboratory.
-*     Iain Duff, AERE Harwell.
-*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
-*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
@@ -54,8 +101,8 @@
       COMPLEX*16         ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
      $                   ONE = ( 1.0D0, 0.0D0 ) )
-      DOUBLE PRECISION   RZERO, RHALF, RONE
-      PARAMETER          ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
       INTEGER            NMAX
       PARAMETER          ( NMAX = 65 )
       INTEGER            NIDMAX, NALMAX, NBEMAX
@@ -104,7 +151,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -113,7 +160,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -190,14 +237,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = RONE
-   70 CONTINUE
-      IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO )
-     $   GO TO 80
-      EPS = RHALF*EPS
-      GO TO 70
-   80 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(RZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of ZMMCH using exact data.
@@ -1303,8 +1343,6 @@
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
-      RALS = RONE
-      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )
@@ -1951,7 +1989,7 @@
 *
 *  Tests the error exits from the Level 3 Blas.
 *  Requires a special version of the error-handling routine XERBLA.
-*  ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined.
+*  A, B and C should not need to be defined.
 *
 *  Auxiliary routine for test program for Level 3 Blas.
 *
@@ -1961,12 +1999,20 @@
 *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
 *     Sven Hammarling, Numerical Algorithms Group Ltd.
 *
+*  3-19-92:  Initialize ALPHA, BETA, RALPHA, and RBETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to ZSYMM and ZHEMM
+*            with INFOT = 9  (eca)
+*  10-9-00:  Declared INTRINSIC DCMPLX (susan)
+*
 *     .. Scalar Arguments ..
       INTEGER            ISNUM, NOUT
       CHARACTER*6        SRNAMT
 *     .. Scalars in Common ..
       INTEGER            INFOT, NOUTC
       LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0D0, TWO = 2.0D0 )
 *     .. Local Scalars ..
       COMPLEX*16         ALPHA, BETA
       DOUBLE PRECISION   RALPHA, RBETA
@@ -1975,6 +2021,8 @@
 *     .. External Subroutines ..
       EXTERNAL           ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM,
      $                   ZSYR2K, ZSYRK, ZTRMM, ZTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX
 *     .. Common blocks ..
       COMMON             /INFOC/INFOT, NOUTC, OK, LERR
 *     .. Executable Statements ..
@@ -1984,6 +2032,14 @@
 *     LERR is set to .TRUE. by the special version of XERBLA each time
 *     it is called, and is then tested and re-set by CHKXER.
       LERR = .FALSE.
+*
+*     Initialize ALPHA, BETA, RALPHA, and RBETA.
+*
+      ALPHA = DCMPLX( ONE, -ONE )
+      BETA = DCMPLX( TWO, -TWO )
+      RALPHA = ONE
+      RBETA = TWO
+*
       GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
      $        90 )ISNUM
    10 INFOT = 1
@@ -2210,16 +2266,16 @@
       CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -2277,16 +2333,16 @@
       CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -3276,7 +3332,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LZERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/utest/Makefile b/utest/Makefile
index 0b9892411..31d4ccf00 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -34,6 +34,9 @@ endif
 ifeq ($(C_COMPILER), PGI)
 OBJS = utest_main2.o
 endif
+ifeq ($(OSNAME), AIX)
+OBJS = utest_main2.o
+endif
 
 all : run_test
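Note on the EPS hunks in zblat2.f and zblat3.f above: both testers drop the run-time bisection loop (and with it the RHALF and RONE parameters, the dead 60/90/100 labels, and the dependence on the DDIFF helper) in favour of the Fortran 90 intrinsic EPSILON, which returns the same machine precision directly. A minimal sketch of the equivalence follows, assuming a Fortran 90 compiler and IEEE double precision; the EPSDEMO program and its local copy of DDIFF are illustrative stand-ins, not part of the patch.

*     Sketch (not part of the patch): the old loop halves OLD until
*     RONE + OLD rounds back to RONE, then doubles once to recover the
*     last distinguishable spacing; EPSILON yields the same value.
      PROGRAM EPSDEMO
      DOUBLE PRECISION   EPS, OLD, RZERO, RHALF, RONE
      PARAMETER          ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
      DOUBLE PRECISION   DDIFF
*     Old approach, as in the removed lines of the testers.
      OLD = RONE
   10 CONTINUE
      IF( DDIFF( RONE + OLD, RONE ).NE.RZERO )THEN
         OLD = RHALF*OLD
         GO TO 10
      END IF
      OLD = OLD + OLD
*     New approach: ask the compiler for the model precision.
      EPS = EPSILON(RZERO)
      WRITE( *, * ) 'loop:', OLD, '  intrinsic:', EPS
      END
*     Forcing the subtraction through a function, as the testers did,
*     keeps an extended-precision register from hiding the rounding.
      DOUBLE PRECISION FUNCTION DDIFF( X, Y )
      DOUBLE PRECISION   X, Y
      DDIFF = X - Y
      RETURN
      END

On conforming IEEE hardware both paths print 2.2204460492503131D-16, which is why the intrinsic can replace the loop without changing any test thresholds.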