Merge branch 'release-0.2.9'

2014-03-06 17:45:31 +08:00 · 2014-03-06 17:45:31 +08:00 · 3e068e78e2
parent 835293cc1a 1140c489c9
commit 3e068e78e2
530 changed files with 144243 additions and 8380 deletions
--- a/BACKERS.md
+++ b/BACKERS.md
@ -0,0 +1,39 @@
+Thank you for the support.
+
+### [2013.8] [Testbed for OpenBLAS project](https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project)
+
+https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project/pledges
+
+In chronological order:
+
+* aeberspaecher
+* fmolina
+* saullocastro
+* xianyi
+* cuda
+* carter
+* StefanKarpinski
+* staticfloat
+* sebastien-villemot
+* JeffBezanson
+* ihnorton
+* simonp0420
+* andrioni
+* Tim Holy
+* ivarne
+* johnmyleswhite
+* traz
+* Jean-Francis Roy
+* bkalpert
+* Anirban
+* pgermain
+* alexandre.lacoste.18
+* foges
+* ssam
+* WestleyArgentum
+* daniebmariani
+* pjpuglia
+* albarrentine
+* Alexander Vogt
+
+
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -10,13 +10,26 @@
  * Optimize BLAS3 on ICT Loongson 3A.
  * Optimize BLAS3 on Intel Sandy Bridge.

+* Werner Saar <wernsaar@googlemail.com>
+  * [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
+  * [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
+  * [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
+  * [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
+  * [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
+  * [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
+  * [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
+  * [2013-06-21] Optimize dcopy kernel on AMD Bulldozer
+  * Porting and Optimization on ARM Cortex-A9
+  * Optimization on AMD Piledriver
+  * Optimization on Intel Haswell
+
+## Previous Developers
+
 * Zaheer Chothia <zaheer.chothia@gmail.com>
  * Improve the compatibility about complex number
  * Build LAPACKE: C interface to LAPACK
  * Improve the windows build.

-## Previous Developers
-
 * Chen Shaohu <huhumartinwar@gmail.com>
  * Optimize GEMV on the Loongson 3A processor. 

@ -52,16 +65,7 @@ In chronological order:

 * Sébastien Villemot <sebastien@debian.org>
  * [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package.
-
-* Werner Saar <wernsaar@googlemail.com>
-  * [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
-  * [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
-  * [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
-  * [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
-  * [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
-  * [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
-  * [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
-  * [2013-06-21] Optimize dcopy kernel on AMD Bulldozer
+  * [2013-08-28] Avoid failure on qemu guests declaring an Athlon CPU without 3dnow!

 * Kang-Che Sung <Explorer09@gmail.com>
  * [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c.
@ -79,9 +83,36 @@ In chronological order:
  * [2013-07-11] create openblas_get_parallel to retrieve information which parallelization 
    model is used by OpenBLAS.

+* Elliot Saba <staticfloat@gmail.com>
+  * [2013-07-22] Add in return value for `interface/trtri.c`
+
 * Sébastien Fabbro <bicatali@gentoo.org>
  * [2013-07-24] Modify makefile to respect user's LDFLAGS
  * [2013-07-24] Add stack markings for GNU as arch-independent for assembler files

+* Viral B. Shah <viral@mayin.org>
+  * [2013-08-21] Patch LAPACK XLASD4.f as discussed in JuliaLang/julia#2340
+
+* Lars Buitinck <https://github.com/larsmans>
+  * [2013-08-28] get rid of the generated cblas_noconst.h file
+  * [2013-08-28] Missing threshold in gemm.c
+  * [2013-08-28] fix default prefix handling in makefiles
+
+* yieldthought <https://github.com/yieldthought>
+  * [2013-10-08] Remove -Wl,--retain-symbols-file from dynamic link line to fix tool support
+
+* Keno Fischer <https://github.com/loladiro>
+  * [2013-10-23] Use FC instead of CC to link the dynamic library on OS X
+
+* Christopher Meng <cickumqt@gmail.com>
+  * [2013-12-09] Add DESTDIR support for easier building on RPM based distros.
+                 Use install command instead of cp to install files with permissions control.
+
+* Lucas Beyer <lucasb.eyer.be@gmail.com>
+  * [2013-12-10] Added support for NO_SHARED in make install.
+
+* carlkl <https://github.com/carlkl>
+  * [2013-12-13] Fixed LAPACKE building bug on Windows
+
 * [Your name or handle] <[email or website]>
  * [Date] [Brief summary of your changes]
--- a/Changelog.txt
+++ b/Changelog.txt
@ -1,4 +1,31 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.2.9.rc2
+06-Mar-2014
+common:
+	* Added OPENBLAS_VERBOSE environment variable.(#338)
+	* Make OpenBLAS thread-pool resilient to fork via pthread_atfork.
+	  (#294, thanks to Olivier Grisel)
+	* Rewrote rotmg
+	* Fixed sdsdot bug.
+x86/x86-64:
+	* Detect Intel Haswell for new Macbook.
+
+====================================================================
+Version 0.2.9.rc1
+13-Jan-2014
+common:
+	* Update LAPACK to 3.5.0 version
+	* Fixed compatibility issues with Clang and Pathscale compilers.
+
+x86/x86-64:
+	* Optimization on Intel Haswell.
+	* Enable optimization kernels on AMD Bulldozer and Piledriver.
+
+ARM:
+	* Support ARMv6 and ARMv7 ISA.
+	* Optimization on ARM Cortex-A9.
+
 ====================================================================
 Version 0.2.8
 01-Aug-2013
--- a/Makefile
+++ b/Makefile
@ -15,10 +15,6 @@ ifdef SANITY_CHECK
 BLASDIRS += reference
 endif

-ifndef PREFIX
-PREFIX = /opt/OpenBLAS
-endif
-
 SUBDIRS	= $(BLASDIRS)
 ifneq ($(NO_LAPACK), 1)
 SUBDIRS	+= lapack
@ -31,7 +27,7 @@ SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench

 all :: libs netlib tests shared
 	@echo
-	@echo " OpenBLAS build complete."
+	@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
 	@echo
 	@echo "  OS               ... $(OSNAME)             "
 	@echo "  Architecture     ... $(ARCH)               "
@ -44,7 +40,9 @@ ifdef INTERFACE64
 	@echo "  Use 64 bits int    (equivalent to \"-i8\" in Fortran)      "
 endif
 	@echo "  C compiler       ... $(C_COMPILER)  (command line : $(CC))"
+ifndef NOFORTRAN
 	@echo "  Fortran compiler ... $(F_COMPILER)  (command line : $(FC))"
+endif
 ifneq ($(OSNAME), AIX)
 	@echo -n "  Library Name     ... $(LIBNAME)"
 else
@ -221,10 +219,10 @@ prof_lapack : lapack_prebuild
 lapack_prebuild :
 ifndef NOFORTRAN
 	-@echo "FORTRAN     = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "OPTS        = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "POPTS       = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "NOOPT       = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "PNOOPT      = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "OPTS        = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "NOOPT       = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "PNOOPT      = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "LOADOPTS    = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "CC          = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "override CFLAGS      = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -320,7 +318,7 @@ clean ::
 ifeq ($(OSNAME), Darwin)
 	@rm -rf getarch.dSYM getarch_2nd.dSYM
 endif
-	@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
+	@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
 	@touch $(NETLIB_LAPACK_DIR)/make.inc
 	@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
 	@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
--- a/Makefile.arm
+++ b/Makefile.arm
@ -0,0 +1,12 @@
+
+ifeq ($(CORE), ARMV7)
+CCOMMON_OPT += -marm -mfpu=vfpv3  -mfloat-abi=hard -march=armv7-a
+FCOMMON_OPT += -marm -mfpu=vfpv3  -mfloat-abi=hard -march=armv7-a
+endif
+
+ifeq ($(CORE), ARMV6)
+CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard  -march=armv6
+FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard  -march=armv6
+endif
+
+
--- a/Makefile.arm64
+++ b/Makefile.arm64
@ -0,0 +1,7 @@
+
+ifeq ($(CORE), ARMV8)
+CCOMMON_OPT += -march=armv8-a
+FCOMMON_OPT += -march=armv8-a
+endif
+
+
--- a/Makefile.install
+++ b/Makefile.install
@ -3,9 +3,11 @@ export GOTOBLAS_MAKEFILE = 1
 -include $(TOPDIR)/Makefile.conf_last
 include ./Makefile.system

-OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include
-OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib
-OPENBLAS_BUILD_DIR:=$(CURDIR)
+PREFIX ?= /opt/OpenBLAS
+
+OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
+OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
+OPENBLAS_BUILD_DIR := $(CURDIR)

 .PHONY : install
 .NOTPARALLEL : install
@ -14,71 +16,73 @@ lib.grd :
 	$(error OpenBLAS: Please run "make" firstly)

 install : 	lib.grd
-	@-mkdir -p $(PREFIX)
-	@-mkdir -p $(OPENBLAS_INCLUDE_DIR)
-	@-mkdir -p $(OPENBLAS_LIBRARY_DIR)
-	@echo Generating openblas_config.h in $(OPENBLAS_INCLUDE_DIR)
+	@-mkdir -p $(DESTDIR)$(PREFIX)
+	@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
+	@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
 #for inc 
-	@echo \#ifndef OPENBLAS_CONFIG_H > $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
-	@echo \#define OPENBLAS_CONFIG_H >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
-	@cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
-	@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
-	@cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
-	@echo \#endif  \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+	@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+	@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+	@awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+	@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+	@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+	@echo \#endif  \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h

-	@echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR)
-	@echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h
-	@echo \#define OPENBLAS_F77BLAS_H >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
-	@echo \#include \"openblas_config.h\" >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
-	@cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
-	@echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
+	@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
+	@echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
+	@echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
+	@echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
+	@cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
+	@echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h

 ifndef NO_CBLAS
-	@echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR)
-	@sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h
+	@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
+	@sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
 endif

 ifndef NO_LAPACKE
-	@echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR)
-	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h
-	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
-	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
-	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
+	@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
+	@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
+	@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
+	@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
 endif

 #for install static library 
-	@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
-	@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
-	@cd $(OPENBLAS_LIBRARY_DIR) ; \
+	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
 	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 #for install shared library 
-	@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
+ifndef NO_SHARED
+	@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 ifeq ($(OSNAME), Linux)
-	@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-	@cd $(OPENBLAS_LIBRARY_DIR) ; \
+	@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
 	ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
 	ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
 ifeq ($(OSNAME), FreeBSD)
-	@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-	@cd $(OPENBLAS_LIBRARY_DIR) ; \
+	@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
 	ln -fs $(LIBSONAME) $(LIBPREFIX).so
 endif
 ifeq ($(OSNAME), NetBSD)
-	@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-	@cd $(OPENBLAS_LIBRARY_DIR) ; \
+	@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
 	ln -fs $(LIBSONAME) $(LIBPREFIX).so
 endif
 ifeq ($(OSNAME), Darwin)     
-	@-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
-	@-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
-	@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
+	@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+	@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
+	@-ln -fs $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
 endif
 ifeq ($(OSNAME), WINNT)
 	@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 	@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
+endif
 endif

 	@echo Install OK!
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@ -17,13 +17,23 @@ ifdef CPUIDEMU
 EXFLAGS = -DCPUIDEMU -DVENDOR=99
 endif

-all: getarch_2nd cblas_noconst.h
+all: getarch_2nd
 	./getarch_2nd  0 >> $(TARGET_MAKE)
 	./getarch_2nd  1 >> $(TARGET_CONF)

 config.h : c_check f_check getarch
 	perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC)
+ifneq ($(ONLY_CBLAS), 1)
 	perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC)
+else
+#When we only build CBLAS, we set NOFORTRAN=2
+	echo "NOFORTRAN=2" >> $(TARGET_MAKE)
+	echo "NO_FBLAS=1" >> $(TARGET_MAKE)
+	echo "F_COMPILER=GFORTRAN"  >> $(TARGET_MAKE)
+	echo "BU=_"  >> $(TARGET_MAKE)
+	echo "#define BUNDERSCORE _" >> $(TARGET_CONF)
+	echo "#define NEEDBUNDERSCORE 1" >> $(TARGET_CONF)
+endif
 	./getarch 0 >> $(TARGET_MAKE)
 	./getarch 1 >> $(TARGET_CONF)

@ -38,7 +48,4 @@ else
 	$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
 endif

-cblas_noconst.h : cblas.h
-	perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h	
-
 dummy:
--- a/Makefile.rule
+++ b/Makefile.rule
@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.2.8
+VERSION = 0.2.9.rc2

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library 
@ -54,6 +54,10 @@ VERSION = 0.2.8
 # If you don't need CBLAS interface, please comment it in.
 # NO_CBLAS = 1

+# If you only want CBLAS interface without installing Fortran compiler, 
+# please comment it in.
+# ONLY_CBLAS = 1
+
 # If you don't need LAPACK, please comment it in. 
 # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
 # NO_LAPACK = 1
--- a/Makefile.system
+++ b/Makefile.system
@ -82,12 +82,19 @@ ifeq ($(HOSTCC), loongcc)
 GETARCH_FLAGS  += -static
 endif

+# If ONLY_CBLAS is set (no Fortran compiler is used), only CBLAS is compiled, so LAPACK is disabled.
+ifeq ($(ONLY_CBLAS), 1)
+NO_LAPACK = 1
+else
+ONLY_CBLAS = 0
+endif
+
 # This operation is expensive, so execution should be once.
 ifndef GOTOBLAS_MAKEFILE
 export GOTOBLAS_MAKEFILE = 1

 # Generating Makefile.conf and config.h
-DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
+DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all)

 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
@ -222,6 +229,11 @@ endif
 endif
 endif

+# Emulate a logical OR in ifeq: the test is true when OSNAME is any one of the listed names.
+ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
+OS_WINDOWS=1
+endif
+
 ifdef QUAD_PRECISION
 CCOMMON_OPT	+= -DQUAD_PRECISION
 NO_EXPRECISION = 1
@ -324,16 +336,14 @@ ifeq ($(ARCH), x86)
 DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
 	       CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE 
-#BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
 endif
 endif

 ifeq ($(ARCH), x86_64)
 DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE 
-#BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
 endif
 endif

@ -363,6 +373,19 @@ NO_BINARY_MODE	= 1
 BINARY_DEFINED	= 1
 endif

+ifeq ($(ARCH), arm)
+NO_BINARY_MODE  = 1
+BINARY_DEFINED  = 1
+endif
+
+ifeq ($(ARCH), arm64)
+NO_BINARY_MODE  = 1
+BINARY_DEFINED  = 1
+endif
+
+
+
+
 #
 #  C Compiler dependent settings
 #
@ -465,10 +488,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
 FCOMMON_OPT += -Wall
 #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
 ifneq ($(NO_LAPACK), 1)
-ifneq ($(C_COMPILER), LSB)
 EXTRALIB += -lgfortran 
 endif
-endif
 ifdef NO_BINARY_MODE
 ifeq ($(ARCH), mips64)
 ifdef BINARY64
@ -825,6 +846,19 @@ ifeq ($(DEBUG), 1)
 COMMON_OPT += -g
 endif

+ifndef COMMON_OPT
+ifeq ($(ARCH), arm)
+COMMON_OPT = -O3
+endif
+endif
+
+ifndef COMMON_OPT
+ifeq ($(ARCH), arm64)
+COMMON_OPT = -O3
+endif
+endif
+
+
 ifndef COMMON_OPT
 COMMON_OPT = -O2
 endif
@ -837,11 +871,24 @@ override FFLAGS     += $(COMMON_OPT) $(FCOMMON_OPT)
 override FPFLAGS    += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
 #MAKEOVERRIDES =

+#For LAPACK Fortran codes.
+#Disable -fopenmp for LAPACK Fortran codes on Windows.
+ifdef OS_WINDOWS
+LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS))
+LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS))
+else
+LAPACK_FFLAGS := $(FFLAGS)
+LAPACK_FPFLAGS := $(FPFLAGS)
+endif
+
 LAPACK_CFLAGS = $(CFLAGS)
 LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H 
 ifdef INTERFACE64
 LAPACK_CFLAGS +=  -DLAPACK_ILP64
 endif
+ifdef OS_WINDOWS
+LAPACK_CFLAGS +=  -DOPENBLAS_OS_WINDOWS
+endif
 ifeq ($(C_COMPILER), LSB)
 LAPACK_CFLAGS +=  -DLAPACK_COMPLEX_STRUCTURE
 endif
@ -887,6 +934,23 @@ LIBZIPNAME   = $(LIBNAME:.$(LIBSUFFIX)=.zip)
 LIBS		= $(TOPDIR)/$(LIBNAME)
 LIBS_P		= $(TOPDIR)/$(LIBNAME_P)

+
+LIB_COMPONENTS = BLAS
+ifneq ($(NO_CBLAS), 1)
+LIB_COMPONENTS += CBLAS
+endif
+
+ifneq ($(NO_LAPACK), 1)
+LIB_COMPONENTS += LAPACK
+ifneq ($(NO_LAPACKE), 1)
+LIB_COMPONENTS += LAPACKE
+endif
+endif
+
+ifeq ($(ONLY_CBLAS), 1)
+LIB_COMPONENTS = CBLAS
+endif
+
 export OSNAME
 export ARCH
 export CORE
@ -913,6 +977,7 @@ export USE_OPENMP
 export CROSS
 export CROSS_SUFFIX
 export NOFORTRAN
+export NO_FBLAS
 export EXTRALIB
 export CEXTRALIB
 export FEXTRALIB
@ -925,6 +990,10 @@ export HAVE_SSE4_2
 export HAVE_SSE4A
 export HAVE_SSE5
 export HAVE_AVX
+export HAVE_VFP
+export HAVE_VFPV3
+export HAVE_VFPV4
+export HAVE_NEON
 export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE
--- a/Makefile.tail
+++ b/Makefile.tail
@ -606,7 +606,8 @@ clean ::
 	@if test -d $(ARCH); then \
 	(cd $(ARCH) && $(MAKE) clean) \
 	fi
-	@rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \
+	@find . -name '*.o' | xargs rm -rf
+	@rm -rf *.a *.s *.po *.obj *.i *.so core core.* gmon.out *.cso \
 	*.csx *.is *~ *.exe *.flame *.pdb *.dwf \
 	gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \
 	*.pc *.pcl *.def *.i *.prof linktest.c \
--- a/README.md
+++ b/README.md
@ -126,3 +126,5 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
 1. Write a test which shows that the bug was fixed or that the feature works as expected.
 1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.

+## Donation
+Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).
--- a/c_check
+++ b/c_check
@ -63,6 +63,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
 $architecture = alpha  if ($data =~ /ARCH_ALPHA/);
 $architecture = sparc  if ($data =~ /ARCH_SPARC/);
 $architecture = ia64   if ($data =~ /ARCH_IA64/);
+$architecture = arm    if ($data =~ /ARCH_ARM/);
+$architecture = arm64  if ($data =~ /ARCH_ARM64/);

 $defined = 0;

@ -149,6 +151,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
 $architecture = alpha  if ($data =~ /ARCH_ALPHA/);
 $architecture = sparc  if ($data =~ /ARCH_SPARC/);
 $architecture = ia64   if ($data =~ /ARCH_IA64/);
+$architecture = arm    if ($data =~ /ARCH_ARM/);
+$architecture = arm64  if ($data =~ /ARCH_ARM64/);

 $binformat    = bin32;
 $binformat    = bin64  if ($data =~ /BINARY_64/);
--- a/cblas.h
+++ b/cblas.h
@ -26,6 +26,15 @@ int openblas_get_parallel(void);
 #define OPENBLAS_OPENMP 2 


+/*
+ * Since all of GotoBlas was written without const,
+ * we disable it at build time.
+ */
+#ifndef OPENBLAS_CONST
+# define OPENBLAS_CONST const
+#endif
+
+
 #define CBLAS_INDEX size_t

 typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
@ -34,265 +43,265 @@ typedef enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
 typedef enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
 typedef enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142} CBLAS_SIDE;

-float  cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
-double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
-float  cblas_sdot(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy);
-double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
+float  cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
+double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
+float  cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float  *y, OPENBLAS_CONST blasint incy);
+double cblas_ddot(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy);

-openblas_complex_float  cblas_cdotu(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy);
-openblas_complex_float  cblas_cdotc(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy);
-openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
-openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
+openblas_complex_float  cblas_cdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float  *y, OPENBLAS_CONST blasint incy);
+openblas_complex_float  cblas_cdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float  *y, OPENBLAS_CONST blasint incy);
+openblas_complex_double cblas_zdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy);
+openblas_complex_double cblas_zdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy);

-void  cblas_cdotu_sub(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy, openblas_complex_float  *ret);
-void  cblas_cdotc_sub(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy, openblas_complex_float  *ret);
-void  cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
-void  cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
+void  cblas_cdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float  *y, OPENBLAS_CONST blasint incy, openblas_complex_float  *ret);
+void  cblas_cdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float  *y, OPENBLAS_CONST blasint incy, openblas_complex_float  *ret);
+void  cblas_zdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy, openblas_complex_double *ret);
+void  cblas_zdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy, openblas_complex_double *ret);

-float  cblas_sasum (const blasint n, const float  *x, const blasint incx);
-double cblas_dasum (const blasint n, const double *x, const blasint incx);
-float  cblas_scasum(const blasint n, const float  *x, const blasint incx);
-double cblas_dzasum(const blasint n, const double *x, const blasint incx);
+float  cblas_sasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float  cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);

-float  cblas_snrm2 (const blasint N, const float  *X, const blasint incX);
-double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
-float  cblas_scnrm2(const blasint N, const float  *X, const blasint incX);
-double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
+float  cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX);
+double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
+float  cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX);
+double cblas_dznrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);

-CBLAS_INDEX cblas_isamax(const blasint n, const float  *x, const blasint incx);
-CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
-CBLAS_INDEX cblas_icamax(const blasint n, const float  *x, const blasint incx);
-CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
+CBLAS_INDEX cblas_isamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);

-void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
-void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
-void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
-void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
+void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
+void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
+void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
+void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);

-void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
-void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
-void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
-void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
+void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
+void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
+void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
+void cblas_zcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);

-void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
-void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
-void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
-void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
+void cblas_sswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
+void cblas_dswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
+void cblas_cswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
+void cblas_zswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);

-void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
-void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double  s);
+void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
+void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double  s);

 void cblas_srotg(float *a, float *b, float *c, float *s);
 void cblas_drotg(double *a, double *b, double *c, double *s);

-void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
-void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
+void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
+void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);

-void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
-void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
+void cblas_srotmg(float *d1, float *d2, float *b1, OPENBLAS_CONST float b2, float *P);
+void cblas_drotmg(double *d1, double *d2, double *b1, OPENBLAS_CONST double b2, double *P);

-void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
-void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
-void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
-void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
-void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
-void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
+void cblas_sscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX);
+void cblas_dscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, OPENBLAS_CONST blasint incX);
+void cblas_cscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, float *X, OPENBLAS_CONST blasint incX);
+void cblas_zscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, double *X, OPENBLAS_CONST blasint incX);
+void cblas_csscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX);
+void cblas_zdscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, OPENBLAS_CONST blasint incX);

-void cblas_sgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
-		 const float alpha, const float  *a, const blasint lda,  const float  *x, const blasint incx,  const float beta,  float  *y, const blasint incy);
-void cblas_dgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
-		 const double alpha, const double  *a, const blasint lda,  const double  *x, const blasint incx,  const double beta,  double  *y, const blasint incy);
-void cblas_cgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
-		 const float *alpha, const float  *a, const blasint lda,  const float  *x, const blasint incx,  const float *beta,  float  *y, const blasint incy);
-void cblas_zgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
-		 const double *alpha, const double  *a, const blasint lda,  const double  *x, const blasint incx,  const double *beta,  double  *y, const blasint incy);
+void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
+		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST float beta,  float  *y, OPENBLAS_CONST blasint incy);
+void cblas_dgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
+		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST double  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST double beta,  double  *y, OPENBLAS_CONST blasint incy);
+void cblas_cgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
+		 OPENBLAS_CONST float *alpha, OPENBLAS_CONST float  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST float *beta,  float  *y, OPENBLAS_CONST blasint incy);
+void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
+		 OPENBLAS_CONST double *alpha, OPENBLAS_CONST double  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST double  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST double *beta,  double  *y, OPENBLAS_CONST blasint incy);

-void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float   alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda);
-void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double  alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
-void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float  *alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda);
-void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float  *alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda);
-void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
-void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
+void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float   alpha, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float  *Y, OPENBLAS_CONST blasint incY, float  *A, OPENBLAS_CONST blasint lda);
+void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double  alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
+void cblas_cgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float  *alpha, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float  *Y, OPENBLAS_CONST blasint incY, float  *A, OPENBLAS_CONST blasint lda);
+void cblas_cgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float  *alpha, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float  *Y, OPENBLAS_CONST blasint incY, float  *A, OPENBLAS_CONST blasint lda);
+void cblas_zgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
+void cblas_zgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);

-void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
-void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_strsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_dtrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
+void cblas_ctrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_ztrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);

-void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
-void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_strmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_dtrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
+void cblas_ctrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_ztrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);

-void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
-void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
-void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
-void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
+void cblas_ssyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda);
+void cblas_dsyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda);
+void cblas_cher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda);
+void cblas_zher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda);

-void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
-                const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
-void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
-                const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
-void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
-                const float *Y, const blasint incY, float *A, const blasint lda);
-void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
-                const double *Y, const blasint incY, double *A, const blasint lda);
+void cblas_ssyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo,OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X,
+                OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda);
+void cblas_dsyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X,
+                OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
+void cblas_cher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX,
+                OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda);
+void cblas_zher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX,
+                OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);

-void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
-                 const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
-void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
-                 const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
-void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
-                 const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
-void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
-                 const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+void cblas_sgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_dgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
+void cblas_cgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_zgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY);

-void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
-                 const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
-void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
-                 const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
+void cblas_ssbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
+                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
+                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);


-void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
-void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_stbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_dtbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
+void cblas_ctbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_ztbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);

-void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
-void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
-void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_stbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_dtbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
+void cblas_ctbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
+void cblas_ztbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);

-void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const float *Ap, float *X, const blasint incX);
-void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const double *Ap, double *X, const blasint incX);
-void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const float *Ap, float *X, const blasint incX);
-void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const double *Ap, double *X, const blasint incX);
+void cblas_stpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
+void cblas_dtpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
+void cblas_ctpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
+void cblas_ztpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);

-void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const float *Ap, float *X, const blasint incX);
-void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const double *Ap, double *X, const blasint incX);
-void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const float *Ap, float *X, const blasint incX);
-void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const blasint N, const double *Ap, double *X, const blasint incX);
+void cblas_stpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
+void cblas_dtpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
+void cblas_ctpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
+void cblas_ztpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
+                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);

-void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
-                 const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
-void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
-                 const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
-void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
-                 const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
-void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
-                 const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+void cblas_ssymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
+                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_dsymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
+                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
+void cblas_chemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A,
+                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_zhemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A,
+                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY);


-void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
-                 const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
-void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
-                 const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
+void cblas_sspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *Ap,
+                 OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_dspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *Ap,
+                 OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);

-void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
-void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
+void cblas_sspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *Ap);
+void cblas_dspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *Ap);

-void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
-void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
+void cblas_chpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A);
+void cblas_zhpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X,OPENBLAS_CONST blasint incX, double *A);

-void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
-void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
-void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
-void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
+void cblas_sspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A);
+void cblas_dspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A);
+void cblas_chpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *Ap);
+void cblas_zhpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *Ap);

-void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
-		 const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
-void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
-		 const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+void cblas_chbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_zhbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY);

-void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
-		 const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
-void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
-		 const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+void cblas_chpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
+		 OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *Ap, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY);
+void cblas_zhpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
+		 OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *Ap, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY);

-void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
-		 const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
-void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
-		 const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
-void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
-		 const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
-void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
-		 const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);

-void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
-                 const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
-void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
-                 const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
-void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
-                 const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
-void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
-                 const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_csymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_zsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);

-void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		 const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
-void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		 const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
-void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		 const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
-void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		 const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
+void cblas_ssyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_csyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_zsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);

-void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		  const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
-void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		  const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
-void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		  const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
-void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
-		  const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+void cblas_ssyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_csyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_zsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
+		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);

-void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
-void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
-void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
-void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
+void cblas_strmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
+void cblas_dtrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
+void cblas_ctrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
+void cblas_ztrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);

-void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
-void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
-void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
-void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
+void cblas_strsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
+void cblas_dtrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
+void cblas_ctrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
+void cblas_ztrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);

-void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
-                 const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
-void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
-                 const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+void cblas_chemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_zhemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                 OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);

-void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
-                 const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
-void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
-                 const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
+void cblas_cherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+                 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_zherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+                 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);

-void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
-                  const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
-void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
-                  const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
+void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+                  OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+                  OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);

 void cblas_xerbla(blasint p, char *rout, char *form, ...);

--- a/cblas_noconst.h
+++ b/cblas_noconst.h
@ -0,0 +1,303 @@
+#ifndef CBLAS_H
+#define CBLAS_H
+
+#include <stddef.h>
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+	/* Assume C declarations for C++ */
+#endif  /* __cplusplus */
+
+/* Set the number of threads at runtime. */
+void openblas_set_num_threads(int num_threads);
+void goto_set_num_threads(int num_threads);
+
+/* Get the build configuration at runtime. */
+char* openblas_get_config(void);
+
+/* Get the parallelization type which is used by OpenBLAS */
+int openblas_get_parallel(void); 
+/* OpenBLAS is compiled for sequential use  */
+#define OPENBLAS_SEQUENTIAL  0
+/* OpenBLAS is compiled using normal threading model */
+#define OPENBLAS_THREAD  1 
+/* OpenBLAS is compiled using OpenMP threading model */
+#define OPENBLAS_OPENMP 2 
+
+
+#define CBLAS_INDEX size_t
+
+typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
+typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
+typedef enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
+typedef enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
+typedef enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
+
+float  cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
+double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
+float  cblas_sdot(blasint n, float  *x, blasint incx, float  *y, blasint incy);
+double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
+
+openblas_complex_float  cblas_cdotu(blasint n, float  *x, blasint incx, float  *y, blasint incy);
+openblas_complex_float  cblas_cdotc(blasint n, float  *x, blasint incx, float  *y, blasint incy);
+openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
+openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
+
+void  cblas_cdotu_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret);
+void  cblas_cdotc_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret);
+void  cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
+void  cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
+
+float  cblas_sasum (blasint n, float  *x, blasint incx);
+double cblas_dasum (blasint n, double *x, blasint incx);
+float  cblas_scasum(blasint n, float  *x, blasint incx);
+double cblas_dzasum(blasint n, double *x, blasint incx);
+
+float  cblas_snrm2 (blasint N, float  *X, blasint incX);
+double cblas_dnrm2 (blasint N, double *X, blasint incX);
+float  cblas_scnrm2(blasint N, float  *X, blasint incX);
+double cblas_dznrm2(blasint N, double *X, blasint incX);
+
+CBLAS_INDEX cblas_isamax(blasint n, float  *x, blasint incx);
+CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
+CBLAS_INDEX cblas_icamax(blasint n, float  *x, blasint incx);
+CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
+
+void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
+void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy);
+void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy);
+void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy);
+
+void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
+void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
+void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
+void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
+
+void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
+void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
+void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
+void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
+
+void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
+void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double  s);
+
+void cblas_srotg(float *a, float *b, float *c, float *s);
+void cblas_drotg(double *a, double *b, double *c, double *s);
+
+void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
+void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
+
+void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
+void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
+
+void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
+void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
+void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
+void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
+void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
+void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
+
+void cblas_sgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
+		 float alpha, float  *a, blasint lda,  float  *x, blasint incx,  float beta,  float  *y, blasint incy);
+void cblas_dgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
+		 double alpha, double  *a, blasint lda,  double  *x, blasint incx,  double beta,  double  *y, blasint incy);
+void cblas_cgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
+		 float *alpha, float  *a, blasint lda,  float  *x, blasint incx,  float *beta,  float  *y, blasint incy);
+void cblas_zgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
+		 double *alpha, double  *a, blasint lda,  double  *x, blasint incx,  double *beta,  double  *y, blasint incy);
+
+void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float   alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
+void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double  alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
+void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
+void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
+void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
+void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
+
+void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
+void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
+void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
+void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
+
+void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
+void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
+void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
+void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
+
+void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
+void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
+void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
+void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
+
+void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
+                blasint incX, float *Y, blasint incY, float *A, blasint lda);
+void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
+                blasint incX, double *Y, blasint incY, double *A, blasint lda);
+void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
+                float *Y, blasint incY, float *A, blasint lda);
+void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
+                double *Y, blasint incY, double *A, blasint lda);
+
+void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
+                 blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
+void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
+                 blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
+void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
+                 blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
+void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
+                 blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
+
+void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
+                 blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
+void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
+                 blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
+
+
+void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
+void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
+void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
+void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
+
+void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
+void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
+void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
+void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
+
+void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, float *Ap, float *X, blasint incX);
+void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, double *Ap, double *X, blasint incX);
+void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, float *Ap, float *X, blasint incX);
+void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, double *Ap, double *X, blasint incX);
+
+void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, float *Ap, float *X, blasint incX);
+void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, double *Ap, double *X, blasint incX);
+void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, float *Ap, float *X, blasint incX);
+void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
+                 blasint N, double *Ap, double *X, blasint incX);
+
+void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
+                 blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
+void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
+                 blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
+void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
+                 blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
+void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
+                 blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
+
+
+void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
+                 float *X, blasint incX, float beta, float *Y, blasint incY);
+void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
+                 double *X, blasint incX, double beta, double *Y, blasint incY);
+
+void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
+void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
+
+void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
+void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
+
+void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
+void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
+void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
+void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
+
+void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
+		 float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
+void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
+		 double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
+
+void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
+		 float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
+void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
+		 double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
+
+void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
+		 float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
+void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
+		 double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
+void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
+		 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
+void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
+		 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
+
+void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
+                 float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
+void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
+                 double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
+void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
+                 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
+void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
+                 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
+
+void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		 blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
+void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		 blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
+void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		 blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
+void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		 blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
+
+void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		  blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
+void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		  blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
+void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		  blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
+void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
+		  blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
+
+void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
+void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
+void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
+void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
+
+void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
+void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
+void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
+void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
+                 enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
+
+void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
+                 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
+void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
+                 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
+
+void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
+                 float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
+void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
+                 double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
+
+void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
+                  float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
+void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
+                  double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
+
+void cblas_xerbla(blasint p, char *rout, char *form, ...);
+
+#ifdef __cplusplus
+}
+#endif  /* __cplusplus */
+
+#endif
--- a/common.h
+++ b/common.h
@ -310,6 +310,16 @@ typedef int blasint;
 #define YIELDING	SwitchToThread()
 #endif

+#if (defined(ARMV7) || defined(ARMV6) || defined(ARMV8)) && !defined(YIELDING)  /* do not redefine an earlier YIELDING (e.g. SwitchToThread() above) */
+#define YIELDING        __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n")  /* yield by executing 8 nops; no trailing ';' so that `YIELDING;` is a single statement */
+#endif
+
+#ifdef PILEDRIVER
+#ifndef YIELDING
+#define YIELDING        __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")  /* same 8-nop yield; the caller supplies the terminating ';' */
+#endif
+#endif
+
 #ifndef YIELDING
 #define YIELDING	sched_yield()
 #endif
@ -363,6 +373,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_mips64.h"
 #endif

+#ifdef ARCH_ARM
+#include "common_arm.h"
+#endif
+
+#ifdef ARCH_ARM64
+#include "common_arm64.h"
+#endif
+
+
 #ifdef OS_LINUX
 #include "common_linux.h"
 #endif
@ -574,9 +593,10 @@ typedef struct {
 #include "common_level2.h"
 #include "common_level3.h"
 #include "common_lapack.h"
+
 #ifdef CBLAS
-/* This header file is generated from "cblas.h" (see Makefile.prebuild). */
-#include "cblas_noconst.h"
+# define OPENBLAS_CONST     /* see comment in cblas.h */
+# include "cblas.h"
 #endif

 #ifndef ASSEMBLER
--- a/common_alpha.h
+++ b/common_alpha.h
@ -151,7 +151,7 @@ REALNAME:
 #endif

 #if defined(__linux__) && defined(__ELF__)
-#define GNUSTACK .section .note.GNU-stack,"",%progbits
+#define GNUSTACK .section .note.GNU-stack,"",@progbits
 #else
 #define GNUSTACK
 #endif
--- a/common_arm.h
+++ b/common_arm.h
@ -0,0 +1,169 @@
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the ISCAS nor the names of its contributors may 
+      be used to endorse or promote products derived from this software 
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#ifndef COMMON_ARM
+#define COMMON_ARM
+
+#define MB
+#define WMB
+
+#define INLINE inline
+
+#define RETURN_BY_COMPLEX
+
+#ifndef ASSEMBLER
+
+/*
+ * Acquire the spin lock stored at *address (0 = free, non-zero = held).
+ * Returns only once this caller owns the lock.
+ *
+ * The previous sequence stored 0 with strex and never looked at the value
+ * it had loaded, so the lock word was never actually set.
+ */
+static void __inline blas_lock(volatile BLASULONG *address){
+
+  int register ret;
+
+  do {
+    /* Read-only spin first; only attempt the exclusive store once the
+       lock looks free. */
+    while (*address) {YIELDING;};
+
+    /* r2  = current lock value (exclusive load)
+       ret = strex status (0 on success) OR'ed with the old value,
+             so ret == 0 only if the lock was free AND our store of 1 won. */
+    __asm__ __volatile__(
+                         "ldrex r2, [%1]                                                \n\t"
+                         "strex %0, %2, [%1]                                            \n\t"
+                         "orr   %0, %0, r2                                              \n\t"
+                         : "=&r"(ret)
+                         : "r"(address), "r"(1)
+                         : "memory", "r2"
+    );
+
+  } while (ret);
+
+  /* NOTE(review): MB is defined empty in this header; on multi-core ARMv7 an
+     acquire barrier (dmb) is probably needed after taking the lock - confirm. */
+}
+
+
+/*
+ * Timestamp substitute for a cycle counter: returns the gettimeofday()
+ * wall-clock time converted to milliseconds.
+ */
+static inline unsigned long long rpcc(void){
+  unsigned long long ret=0;
+  double v;
+  struct timeval tv;
+  gettimeofday(&tv,NULL);
+  v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
+  /* "1000.0d" used a GNU-only literal suffix that other compilers reject;
+     an unsuffixed constant is already a double. */
+  ret = (unsigned long long) ( v * 1000.0 );
+  return ret;
+}
+
+/* Integer quotient x / y; no reciprocal-table shortcut is used on ARM.
+   The caller must pass a non-zero y. */
+static inline int blas_quickdivide(blasint x, blasint y){
+  blasint quotient = x / y;
+  return quotient;
+}
+
+/* GET_IMAGE(res): store VFP register d1 (DOUBLE builds) or s1 (otherwise)
+   into the lvalue res.
+   NOTE(review): presumably used to fetch the imaginary part of a complex
+   result returned in FP registers - confirm against the callers. */
+#if defined(DOUBLE)
+#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
+#else
+#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
+#endif
+
+/* Expands to nothing on ARM. */
+#define GET_IMAGE_CANCEL
+
+#endif
+
+
+#ifndef F_INTERFACE
+#define REALNAME ASMNAME
+#else
+#define REALNAME ASMFNAME
+#endif
+
+#if defined(ASSEMBLER) && !defined(NEEDPARAM)
+
+/* Entry boilerplate for hand-written kernels: switch the assembler to ARM
+   (not Thumb) encoding, export REALNAME, open a .func record for it and
+   place the entry label. */
+#define PROLOGUE \
+	.arm		 ;\
+	.global	REALNAME ;\
+	.func	REALNAME  ;\
+REALNAME:
+
+#define EPILOGUE 
+
+#define PROFCODE
+
+#endif
+
+
+#define SEEK_ADDRESS
+
+#ifndef PAGESIZE
+#define PAGESIZE        ( 4 << 10)
+#endif
+#define HUGE_PAGESIZE   ( 4 << 20)
+
+#define BUFFER_SIZE     (16 << 20)
+
+
+#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+#endif
--- a/common_arm64.h
+++ b/common_arm64.h
@ -0,0 +1,169 @@
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the ISCAS nor the names of its contributors may 
+      be used to endorse or promote products derived from this software 
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#ifndef COMMON_ARM64
+#define COMMON_ARM64
+
+#define MB
+#define WMB
+
+#define INLINE inline
+
+#define RETURN_BY_COMPLEX
+
+#ifndef ASSEMBLER
+
+/*
+ * Acquire the spin lock stored at *address (0 = free, non-zero = held).
+ *
+ * The ARM32 ldrex/strex sequence copied into this file was commented out
+ * (those mnemonics and r-registers do not exist on AArch64), which left the
+ * function empty and provided no mutual exclusion at all.  Use the compiler's
+ * atomic exchange builtin instead; it stores 1 and returns the previous
+ * value with acquire semantics.
+ */
+static void __inline blas_lock(volatile BLASULONG *address){
+
+  do {
+    /* Read-only spin until the lock looks free, then try to grab it. */
+    while (*address) {YIELDING;};
+
+  } while (__sync_lock_test_and_set(address, 1));
+
+}
+
+
+/*
+ * Timestamp substitute for a cycle counter: returns the gettimeofday()
+ * wall-clock time converted to milliseconds.
+ */
+static inline unsigned long long rpcc(void){
+  unsigned long long ret=0;
+  double v;
+  struct timeval tv;
+  gettimeofday(&tv,NULL);
+  v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
+  /* "1000.0d" used a GNU-only literal suffix that other compilers reject;
+     an unsuffixed constant is already a double. */
+  ret = (unsigned long long) ( v * 1000.0 );
+  return ret;
+}
+
+/* Integer quotient x / y; no reciprocal-table shortcut is used on ARM64.
+   The caller must pass a non-zero y. */
+static inline int blas_quickdivide(blasint x, blasint y){
+  blasint quotient = x / y;
+  return quotient;
+}
+
+/* GET_IMAGE(res): store FP register d1 (DOUBLE builds) or s1 (otherwise)
+   into the lvalue res.  "vstr.f64"/"vstr.f32" are ARM32 VFP mnemonics and
+   do not assemble for AArch64, where the scalar FP store is plain "str". */
+#if defined(DOUBLE)
+#define GET_IMAGE(res)  __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
+#else
+#define GET_IMAGE(res)  __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
+#endif
+
+/* Expands to nothing on ARM64. */
+#define GET_IMAGE_CANCEL
+
+#endif
+
+
+#ifndef F_INTERFACE
+#define REALNAME ASMNAME
+#else
+#define REALNAME ASMFNAME
+#endif
+
+#if defined(ASSEMBLER) && !defined(NEEDPARAM)
+
+/* Entry boilerplate for hand-written kernels: place the code in .text,
+   align it, export REALNAME as a function symbol and emit the entry label.
+   The ARM32-only ".arm" directive (select ARM vs. Thumb encoding) is not
+   accepted by the AArch64 assembler and has been dropped. */
+#define PROLOGUE \
+	.text ;\
+	.align	4 ;\
+	.global	REALNAME ;\
+	.type	REALNAME, %function ;\
+REALNAME:
+
+#define EPILOGUE 
+
+#define PROFCODE
+
+#endif
+
+
+#define SEEK_ADDRESS
+
+#ifndef PAGESIZE
+#define PAGESIZE        ( 4 << 10)
+#endif
+#define HUGE_PAGESIZE   ( 4 << 20)
+
+#define BUFFER_SIZE     (16 << 20)
+
+
+#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+#endif
--- a/common_ia64.h
+++ b/common_ia64.h
@ -380,7 +380,7 @@ REALNAME:
 #endif

 #if defined(__linux__) && defined(__ELF__)
-#define GNUSTACK .section .note.GNU-stack,"",%progbits
+#define GNUSTACK .section .note.GNU-stack,"",@progbits
 #else
 #define GNUSTACK
 #endif
--- a/common_mips64.h
+++ b/common_mips64.h
@ -236,7 +236,7 @@ REALNAME: ;\
 	.set	nomacro

 #if defined(__linux__) && defined(__ELF__)
-#define GNUSTACK .section .note.GNU-stack,"",%progbits
+#define GNUSTACK .section .note.GNU-stack,"",@progbits
 #else
 #define GNUSTACK
 #endif
--- a/common_s.h
+++ b/common_s.h
@ -17,7 +17,7 @@
 #define	SCOPY_K			scopy_k
 #define	SDOTU_K			sdot_k
 #define	SDOTC_K			sdot_k
-#define	SDSDOT_K		sdot_k
+#define	SDSDOT_K		dsdot_k
 #define	DSDOT_K			dsdot_k
 #define	SNRM2_K			snrm2_k
 #define	SSCAL_K			sscal_k
@ -162,7 +162,7 @@
 #define	SCOPY_K			gotoblas -> scopy_k
 #define	SDOTU_K			gotoblas -> sdot_k
 #define	SDOTC_K			gotoblas -> sdot_k
-#define	SDSDOT_K		gotoblas -> sdot_k
+#define	SDSDOT_K		gotoblas -> dsdot_k
 #define	DSDOT_K			gotoblas -> dsdot_k
 #define	SNRM2_K			gotoblas -> snrm2_k
 #define	SSCAL_K			gotoblas -> sscal_k
--- a/common_sparc.h
+++ b/common_sparc.h
@ -201,7 +201,7 @@ static __inline int blas_quickdivide(blasint x, blasint y){
 REALNAME:;

 #if defined(__linux__) && defined(__ELF__)
-#define GNUSTACK .section .note.GNU-stack,"",%progbits
+#define GNUSTACK .section .note.GNU-stack,"",@progbits
 #else
 #define GNUSTACK
 #endif
--- a/common_thread.h
+++ b/common_thread.h
@ -103,7 +103,7 @@ typedef struct blas_queue {

  struct blas_queue *next;

-#if defined( __WIN32__) || defined(__CYGWIN32__)
+#if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
  CRITICAL_SECTION lock;
  HANDLE finish;
 #else
--- a/common_x86.h
+++ b/common_x86.h
@ -303,7 +303,7 @@ REALNAME:

 #define EPILOGUE \
        .size	 REALNAME, .-REALNAME; \
-        .section .note.GNU-stack,"",%progbits
+        .section .note.GNU-stack,"",@progbits

 #endif

--- a/common_x86_64.h
+++ b/common_x86_64.h
@ -374,7 +374,7 @@ REALNAME:

 #define EPILOGUE \
        .size	 REALNAME, .-REALNAME; \
-        .section .note.GNU-stack,"",%progbits
+        .section .note.GNU-stack,"",@progbits


 #endif
--- a/cpuid.h
+++ b/cpuid.h
@ -105,9 +105,9 @@
 #define CORE_NANO	19
 #define CORE_SANDYBRIDGE 20
 #define CORE_BOBCAT     21
-#define CORE_BULLDOZER CORE_BARCELONA
-#define CORE_PILEDRIVER CORE_BARCELONA
-#define CORE_HASWELL CORE_SANDYBRIDGE
+#define CORE_BULLDOZER  22
+#define CORE_PILEDRIVER  23
+#define CORE_HASWELL 24

 #define HAVE_SSE      (1 <<  0)
 #define HAVE_SSE2     (1 <<  1)
@ -198,9 +198,8 @@ typedef struct {
 #define CPUTYPE_NANO			43
 #define CPUTYPE_SANDYBRIDGE             44
 #define CPUTYPE_BOBCAT                  45
-#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
-#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
-// this define is because BLAS doesn't have haswell specific optimizations yet
-#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE 
+#define CPUTYPE_BULLDOZER               46
+#define CPUTYPE_PILEDRIVER              47
+#define CPUTYPE_HASWELL 48

 #endif
--- a/cpuid_arm.c
+++ b/cpuid_arm.c
@ -0,0 +1,262 @@
+/**************************************************************************
+  Copyright (c) 2013, The OpenBLAS Project
+  All rights reserved.
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  1. Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in
+  the documentation and/or other materials provided with the
+  distribution.
+  3. Neither the name of the OpenBLAS project nor the names of
+  its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  *****************************************************************************/
+
+#include <string.h>
+
+#define CPU_UNKNOWN     	0
+#define CPU_ARMV6       	1
+#define CPU_ARMV7       	2
+#define CPU_CORTEXA15       	3
+
+/* Core names indexed by the CPU_* identifiers defined above. */
+static char *cpuname[] = {
+  "UNKNOWN",	/* CPU_UNKNOWN - was misspelled "UNKOWN" */
+  "ARMV6",
+  "ARMV7",
+  "CORTEXA15"
+};
+
+
+/*
+ * Return 1 if `search` appears as a whole word on the first "Features" line
+ * of /proc/cpuinfo, 0 otherwise (including when the file cannot be read or
+ * the build is not for Linux).
+ */
+int get_feature(char *search)
+{
+
+#ifdef linux
+	FILE *infile;
+  	char buffer[2048], *p,*t;
+  	p = (char *) NULL ;
+
+  	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return(0);	/* no procfs: report "not present" */
+
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+
+		if (!strncmp("Features", buffer, 8))
+		{
+			/* Keep the raw strchr() result so a line without ':' leaves p NULL. */
+			p = strchr(buffer, ':');
+			break;
+      		}
+  	}
+
+  	fclose(infile);
+
+
+	if( p == NULL ) return(0);
+
+	/* Walk every token after the colon.  The delimiter set includes the line
+	   terminator so the last feature matches, and the first token is tested
+	   too (it used to be discarded). */
+	for (t = strtok(p + 1, " \t\r\n"); t != NULL; t = strtok(NULL, " \t\r\n"))
+	{
+		if (!strcmp(t, search))   { return(1); }
+	}
+
+#endif
+	return(0);
+}
+
+
+/*
+ * Classify the host CPU from the "model name" line of /proc/cpuinfo combined
+ * with the VFP feature flags.  Returns one of the CPU_* identifiers;
+ * CPU_UNKNOWN when nothing matches, the file is unreadable, or the build is
+ * not for Linux.
+ */
+int detect(void)
+{
+
+#ifdef linux
+
+	FILE *infile;
+  	char buffer[512], *p;
+  	p = (char *) NULL ;
+
+  	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return CPU_UNKNOWN;
+
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+
+		if (!strncmp("model name", buffer, 10))
+		{
+			/* Raw strchr() result: stays NULL if the line has no ':' */
+			p = strchr(buffer, ':');
+			break;
+      		}
+  	}
+
+  	fclose(infile);
+
+  	if(p != NULL)
+	{
+
+		if (strstr(p, "ARMv7")) 
+		{
+			 if ( get_feature("vfpv4"))
+			 	return CPU_ARMV7;
+
+			 if ( get_feature("vfpv3"))
+			 	return CPU_ARMV7;
+
+			 /* ARMv7 core with only basic VFP: use the ARMV6 target. */
+			 if ( get_feature("vfp"))
+			 	return CPU_ARMV6;
+
+
+		}
+
+		if (strstr(p, "ARMv6")) 
+		{
+			 if ( get_feature("vfp"))
+			 	return CPU_ARMV6;
+		}
+
+
+	}
+#endif
+
+	return CPU_UNKNOWN;
+}
+
+/* Name of the detected core, looked up in cpuname[] by its CPU_* id. */
+char *get_corename(void)
+{
+	int core_id = detect();
+
+	return cpuname[core_id];
+}
+
+/* Write the architecture family name to stdout (no trailing newline). */
+void get_architecture(void)
+{
+	fputs("ARM", stdout);
+}
+
+/* Write the detected sub-architecture name to stdout (no trailing newline);
+   anything other than ARMV7/ARMV6 is reported as UNKNOWN. */
+void get_subarchitecture(void)
+{
+	const char *label = "UNKNOWN";
+	int core = detect();
+
+	if (core == CPU_ARMV7)
+		label = "ARMV7";
+	else if (core == CPU_ARMV6)
+		label = "ARMV6";
+
+	printf("%s", label);
+}
+
+/* Write the kernel sub-directory name to stdout (no trailing newline). */
+void get_subdirname(void)
+{
+	fputs("arm", stdout);
+}
+
+/* Emit the cache/TLB #defines shared by every core handled here.  These are
+   fixed defaults written by hand, not values probed from the hardware.
+   L2_SIZE was 512488, which is not a power of two and looks like a digit
+   transposition of 524288 (512 KiB). */
+static void print_default_cache_config(void)
+{
+	printf("#define L1_DATA_SIZE 65536\n");
+	printf("#define L1_DATA_LINESIZE 32\n");
+	printf("#define L2_SIZE 524288\n");
+	printf("#define L2_LINESIZE 32\n");
+	printf("#define DTB_DEFAULT_ENTRIES 64\n");
+	printf("#define DTB_SIZE 4096\n");
+	printf("#define L2_ASSOCIATIVE 4\n");
+}
+
+/*
+ * Print the configuration #defines for the detected core to stdout.
+ * Nothing is printed when the core is not recognised.
+ */
+void get_cpuconfig(void)
+{
+
+	int d = detect();
+	switch (d)
+	{
+
+		case CPU_ARMV7:
+			printf("#define ARMV7\n");
+			printf("#define HAVE_VFP\n");
+			printf("#define HAVE_VFPV3\n");
+			/* Optional units are reported only if /proc/cpuinfo lists them. */
+			if ( get_feature("neon"))	printf("#define HAVE_NEON\n");
+			if ( get_feature("vfpv4"))	printf("#define HAVE_VFPV4\n");
+			print_default_cache_config();
+			break;
+
+		case CPU_ARMV6:
+			printf("#define ARMV6\n");
+			printf("#define HAVE_VFP\n");
+			print_default_cache_config();
+			break;
+
+	}
+}
+
+
+/* Write the library-name suffix for the detected core, followed by a
+   newline; prints nothing when the core is not recognised. */
+void get_libname(void)
+{
+	int core = detect();
+
+	if (core == CPU_ARMV7)
+		printf("armv7\n");
+	else if (core == CPU_ARMV6)
+		printf("armv6\n");
+}
+
+
+/*
+ * Print one "HAVE_xxx=1" line to stdout for each of vfp, vfpv3, vfpv4 and
+ * neon found on the first "Features" line of /proc/cpuinfo.  Prints nothing
+ * when the file cannot be read or the build is not for Linux.
+ */
+void get_features(void)
+{
+
+#ifdef linux
+	FILE *infile;
+  	char buffer[2048], *p,*t;
+  	p = (char *) NULL ;
+
+  	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return;
+
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+
+		if (!strncmp("Features", buffer, 8))
+		{
+			/* Raw strchr() result: stays NULL if the line has no ':' */
+			p = strchr(buffer, ':');
+			break;
+      		}
+  	}
+
+  	fclose(infile);
+
+
+	if( p == NULL ) return;
+
+	/* Visit every token after the colon; the delimiter set strips the line
+	   terminator, and the first token is no longer skipped. */
+	for (t = strtok(p + 1, " \t\r\n"); t != NULL; t = strtok(NULL, " \t\r\n"))
+	{
+		if (!strcmp(t, "vfp"))   { printf("HAVE_VFP=1\n"); continue; }
+		if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; }
+		if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; }
+		if (!strcmp(t, "neon"))  { printf("HAVE_NEON=1\n"); continue; }
+	}
+
+#endif
+	return;
+}
+
+
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@ -1051,11 +1051,14 @@ int get_cpuname(void){
      case 3:
 	switch (model) {
 	case 10:
+        case 14:
+	  // Ivy Bridge
 	  if(support_avx())
 	    return CPUTYPE_SANDYBRIDGE;
 	  else
 	    return CPUTYPE_NEHALEM;
        case 12:
+	case 15:
          if(support_avx())
            return CPUTYPE_HASWELL;
          else
@ -1065,6 +1068,7 @@ int get_cpuname(void){
      case 4:
        switch (model) {
        case 5:
+	case 6:
          if(support_avx())
            return CPUTYPE_HASWELL;
          else
@ -1243,6 +1247,7 @@ static char *cpuname[] = {
  "BOBCAT",
  "BULLDOZER",
  "PILEDRIVER",
+  "HASWELL",
 };

 static char *lowercpuname[] = {
@ -1293,6 +1298,7 @@ static char *lowercpuname[] = {
  "bobcat",
  "bulldozer",
  "piledriver",
+  "haswell",
 };

 static char *corename[] = {
@ -1320,6 +1326,7 @@ static char *corename[] = {
  "BOBCAT",
  "BULLDOZER",
  "PILEDRIVER",
+  "HASWELL",
 };

 static char *corename_lower[] = {
@ -1347,6 +1354,7 @@ static char *corename_lower[] = {
  "bobcat",
  "bulldozer",
  "piledriver",
+  "haswell",
 };


@ -1453,11 +1461,13 @@ int get_coretype(void){
      case 3:
 	switch (model) {
 	case 10:
+	case 14:
 	  if(support_avx())
 	    return CORE_SANDYBRIDGE;
 	  else
 	    return CORE_NEHALEM; //OS doesn't support AVX
        case 12:
+	case 15:
          if(support_avx())
            return CORE_HASWELL;
          else
@ -1467,6 +1477,7 @@ int get_coretype(void){
      case 4:
        switch (model) {
        case 5:
+	case 6:
          if(support_avx())
            return CORE_HASWELL;
          else
@ -1547,7 +1558,13 @@ void get_cpuconfig(void){
      printf("#define L2_SIZE %d\n", info.size * 1024);
      printf("#define L2_ASSOCIATIVE %d\n", info.associative);
      printf("#define L2_LINESIZE %d\n", info.linesize);
+    } else {
+      //fall back for some virtual machines.
+      printf("#define L2_SIZE 1048576\n");
+      printf("#define L2_ASSOCIATIVE 6\n");
+      printf("#define L2_LINESIZE 64\n");
    }
+
    
    get_cacheinfo(CACHE_INFO_L3, &info);
    if (info.size > 0) {
--- a/ctest.c
+++ b/ctest.c
@ -124,3 +124,12 @@ ARCH_IA64
 #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__)
 BINARY_64
 #endif
+
+#if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__)
+ARCH_ARM
+#endif
+
+#if defined(__aarch64__)
+ARCH_ARM64
+#endif
+
--- a/driver/level3/level3.c
+++ b/driver/level3/level3.c
@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;

-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
-        if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
-        else
+#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
                if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
                else
                        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
 	min_jj = MIN(n_to, xxx + div_n) - jjs;

-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
-	if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
-	else
+#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
 		if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 		else
 			if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@ -1,7 +1,7 @@
 TOPDIR	= ../..
 include ../../Makefile.system

-COMMONOBJS	 = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) 
+COMMONOBJS	 = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) 

 COMMONOBJS	+= slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX)  dlamc3.$(SUFFIX)

@ -109,6 +109,9 @@ openblas_get_config.$(SUFFIX) : openblas_get_config.c
 openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

+openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
+	$(CC) $(CFLAGS) -c $< -o $(@F)
+
 blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
 	$(CC) $(CFLAGS) -c $< -o $(@F)

--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@ -74,6 +74,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/resource.h>
 #endif

+/* Branch-prediction hints.  With GCC-compatible compilers they wrap
+   __builtin_expect (the !! normalises the condition to 0/1); elsewhere they
+   reduce to the bare condition.  Each is defined only if not already
+   provided by another header. */
+#ifndef likely
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#else
+#define likely(x) (x)
+#endif
+#endif
+#ifndef unlikely
+#ifdef __GNUC__
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define unlikely(x) (x)
+#endif
+#endif
+
 #ifdef SMP_SERVER

 #undef MONITOR
@ -584,6 +599,10 @@ static BLASULONG exec_queue_lock = 0;

 int exec_blas_async(BLASLONG pos, blas_queue_t *queue){

+#ifdef SMP_SERVER
+  // Handle lazy re-init of the thread-pool after a POSIX fork
+  if (unlikely(blas_server_avail == 0)) blas_thread_init();
+#endif
  BLASLONG i = 0;
  blas_queue_t *current = queue;
 #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
@ -708,7 +727,11 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 /* Execute Threads */
 int exec_blas(BLASLONG num, blas_queue_t *queue){

-   int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
+#ifdef SMP_SERVER
+  // Handle lazy re-init of the thread-pool after a POSIX fork
+  if (unlikely(blas_server_avail == 0)) blas_thread_init();
+#endif
+  int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);

 #ifdef TIMING_DEBUG
  BLASULONG start, stop;
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@ -441,9 +441,10 @@ int BLASFUNC(blas_thread_shutdown)(void){
  if (blas_server_avail){

    SetEvent(pool.killed);
-    
+
    for(i = 0; i < blas_num_threads - 1; i++){
-      WaitForSingleObject(blas_threads[i], INFINITE);
+     WaitForSingleObject(blas_threads[i], 5);  //INFINITE);
+	 TerminateThread(blas_threads[i],0);
    }
    
    blas_server_avail = 0;
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@ -38,6 +38,7 @@

 #include "common.h"

+
 #ifdef ARCH_X86
 #define EXTERN extern
 #else
@ -63,18 +64,17 @@ extern gotoblas_t  gotoblas_BARCELONA;
 extern gotoblas_t  gotoblas_BOBCAT;
 #ifndef NO_AVX
 extern gotoblas_t  gotoblas_SANDYBRIDGE;
-//extern gotoblas_t  gotoblas_BULLDOZER;
-//extern gotoblas_t  gotoblas_PILEDRIVER;
+extern gotoblas_t  gotoblas_BULLDOZER;
+extern gotoblas_t  gotoblas_PILEDRIVER;
+extern gotoblas_t  gotoblas_HASWELL;
 #else
 //Use NEHALEM kernels for sandy bridge
 #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
-#endif
-
+#define gotoblas_HASWELL gotoblas_NEHALEM
 #define gotoblas_BULLDOZER gotoblas_BARCELONA
 #define gotoblas_PILEDRIVER gotoblas_BARCELONA
+#endif

-//Use sandy bridge kernels for haswell.
-#define gotoblas_HASWELL gotoblas_SANDYBRIDGE

 #define VENDOR_INTEL      1
 #define VENDOR_AMD        2
@ -109,6 +109,11 @@ int support_avx(){
 #endif
 }

+extern void openblas_warning(int verbose, const char * msg);
+#define FALLBACK_VERBOSE 1
+#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
+#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
+
 static int get_vendor(void){
  int eax, ebx, ecx, edx;
  char vendor[13];
@ -180,38 +185,38 @@ static gotoblas_t *get_coretype(void){
 	  if(support_avx())
 	    return &gotoblas_SANDYBRIDGE;
 	  else{
-	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 	  }
 	}
 	return NULL;
      case 3:
 	//Intel Sandy Bridge 22nm (Ivy Bridge?)
-	if (model == 10) {
+	if (model == 10 || model == 14) {
 	  if(support_avx())
 	    return &gotoblas_SANDYBRIDGE;
 	  else{
-	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 	  }
 	}
 	//Intel Haswell
-	if (model == 12) {
+	if (model == 12 || model == 15) {
 	  if(support_avx())
 	    return &gotoblas_HASWELL;
 	  else{
-	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 	  }
 	}
 	return NULL;
      case 4:
 		//Intel Haswell
-	if (model == 5) {
+	if (model == 5 || model == 6) {
 	  if(support_avx())
 	    return &gotoblas_HASWELL;
 	  else{
-	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 	  }
 	}
@ -224,7 +229,19 @@ static gotoblas_t *get_coretype(void){
  }

  if (vendor == VENDOR_AMD){
-    if (family <= 0xe) return &gotoblas_ATHLON;
+    if (family <= 0xe) {
+        // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon.
+        // Leaf 0x80000000 returns the highest supported extended leaf in EAX;
+        // the feature bits tested below live in leaf 0x80000001, so that leaf
+        // has to exist before it is queried.
+        cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+        // The parentheses matter: '>=' binds tighter than '&', so without them
+        // the test collapses to (eax & 1) and rejects every CPU whose highest
+        // extended leaf number is even.
+        if ((eax & 0xffff) < 0x01)
+          return NULL;
+
+        cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+        // Both EDX bit 30 and bit 31 must be set (3dnowext / 3dnow).
+        // 1U keeps the bit-31 shift out of signed-overflow territory.
+        if ((edx & (1U << 30)) == 0 || (edx & (1U << 31)) == 0)
+          return NULL;
+
+        return &gotoblas_ATHLON;
+      }
    if (family == 0xf){
      if ((exfamily == 0) || (exfamily == 2)) {
 	if (ecx & (1 <<  0)) return &gotoblas_OPTERON_SSE3; 
@ -237,7 +254,7 @@ static gotoblas_t *get_coretype(void){
 	  if(support_avx())
 	    return &gotoblas_BULLDOZER;
 	  else{
-	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
+	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
 	  }
 	}else if(model == 2){
@ -245,7 +262,7 @@ static gotoblas_t *get_coretype(void){
 	  if(support_avx())
 	    return &gotoblas_PILEDRIVER;
 	  else{
-	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
+	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
 	  }
 	}
@ -287,6 +304,7 @@ static char *corename[] = {
    "Bobcat",
    "Bulldozer",
    "Piledriver",
+    "Haswell",
 };

 char *gotoblas_corename(void) {
@ -309,7 +327,8 @@ char *gotoblas_corename(void) {
  if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
  if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
  if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
-  if (gotoblas == &gotoblas_PILEDRIVER)    return corename[19];
+  if (gotoblas == &gotoblas_PILEDRIVER)   return corename[19];
+  if (gotoblas == &gotoblas_HASWELL)      return corename[20];

  return corename[0];
 }
@ -338,7 +357,7 @@ void gotoblas_dynamic_init(void) {
  if (gotoblas && gotoblas -> init) {
    gotoblas -> init();
  } else {
-    fprintf(stderr, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
+    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
    exit(1);
  }
  
--- a/driver/others/init.c
+++ b/driver/others/init.c
@ -494,7 +494,7 @@ static void disable_affinity(void) {

 #ifndef USE_OPENMP
  for(i=0; i< count; i++){
-    lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
+    lprocmask[i] &= common->avail[i];
  }
 #endif

@ -754,7 +754,7 @@ void gotoblas_affinity_init(void) {
    if (common -> num_nodes > 1) numa_mapping();

    common -> final_num_procs = 0;
-    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
+    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1;   //Make the max cpu number.

    for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] =  0;

--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@ -143,6 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 gotoblas_t *gotoblas = NULL;
 #endif

+extern void openblas_warning(int verbose, const char * msg);
+
 #ifndef SMP

 #define blas_cpu_number 1
@ -253,6 +255,23 @@ int  goto_get_num_procs  (void) {
  return blas_cpu_number;
 }

+void openblas_fork_handler()
+{
+  // Installs a pre-fork hook that shuts down the OpenBLAS-managed PTHREAD pool
+  // (the pool used when OpenBLAS is built with "make USE_OPENMP=0").
+  // A hang after fork is still possible when OpenBLAS is built against the
+  // libgomp implementation of OpenMP; that problem is tracked at:
+  //   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
+  // Until it is resolved, build with USE_OPENMP=0 or link against another
+  // implementation of OpenMP.
+#if !defined(OS_WINDOWS) && defined(SMP_SERVER)
+  // Only the "prepare" slot is filled; no parent/child handlers are registered.
+  if (pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL) != 0)
+    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
+#endif
+}
+
 int blas_get_cpu_number(void){
  char *p;
 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
@ -363,7 +382,7 @@ static void *alloc_mmap(void *address){
 #define BENCH_ITERATION 4
 #define SCALING		2

-static inline BLASULONG run_bench(BLASULONG address, long size) {
+static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {

  BLASULONG original, *p;
  BLASULONG start, stop, min;
@ -450,12 +469,12 @@ static void *alloc_mmap(void *address){
 	current = (SCALING - 1) * BUFFER_SIZE;
 	
 	while(current > 0) {
-	  *(long *)start = (long)start + PAGESIZE;
+	  *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
 	  start += PAGESIZE;
 	  current -= PAGESIZE;
 	}
 	
-	*(long *)(start - PAGESIZE) = (BLASULONG)map_address;
+	*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
 	
 	start = (BLASULONG)map_address;
 	
@ -1170,7 +1189,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,

 #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)

-  long size;
+  size_t size;
  BLASULONG buffer;

  size   = BUFFER_SIZE - PAGESIZE;
@ -1268,6 +1287,9 @@ void CONSTRUCTOR gotoblas_init(void) {

  if (gotoblas_initialized) return;

+#ifdef SMP
+  openblas_fork_handler();
+#endif

 #ifdef PROFILE
   moncontrol (0);
--- a/driver/others/openblas_error_handle.c
+++ b/driver/others/openblas_error_handle.c
@ -0,0 +1,51 @@
+/***************************************************************************
+Copyright (c) 2013,                               The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*****************************************************************************/
+
+#include "common.h"
+
+/* Returns the verbosity level read from the OPENBLAS_VERBOSE environment
+ * variable. An unset variable yields 0, and negative values are clamped to 0. */
+int openblas_verbose() {
+  const char *setting = getenv("OPENBLAS_VERBOSE");
+  int level = setting ? atoi(setting) : 0;
+  return (level < 0) ? 0 : level;
+}
+
+/* Writes msg to stderr when the current verbosity (see openblas_verbose) is
+ * at least `verbose`; otherwise does nothing. msg is printed verbatim. */
+void openblas_warning(int verbose, const char * msg) {
+  if (openblas_verbose() < verbose) return;
+  fprintf(stderr, "%s", msg);
+}
--- a/exports/Makefile
+++ b/exports/Makefile
@ -22,6 +22,10 @@ ifndef NEED2UNDERSCORES
 NEED2UNDERSCORES=0
 endif

+ifndef ONLY_CBLAS
+ONLY_CBLAS	= 0
+endif
+
 ifeq ($(OSNAME), WINNT)
 ifeq ($(F_COMPILER), GFORTRAN)
 EXTRALIB += -lgfortran
@ -98,16 +102,16 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
 	-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)

 libopenblas.def : gensymbol
-	perl ./gensymbol win2k    $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
+	perl ./gensymbol win2k    $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)

 libgoto2_shared.def : gensymbol
-	perl ./gensymbol win2k    $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
+	perl ./gensymbol win2k    $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)

 libgoto_hpl.def : gensymbol
-	perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
+	perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)

 $(LIBDYNNAME) : ../$(LIBNAME) osx.def
-	$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB)
+	$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB)

 symbol.$(SUFFIX) : symbol.S
 	$(CC) $(CFLAGS) -c -o $(@F) $^
@ -120,14 +124,17 @@ ifeq ($(OSNAME), Linux)
 so : ../$(LIBSONAME)

 ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
+ifneq ($(C_COMPILER), LSB)
 	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
 	-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-	-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
-ifneq ($(C_COMPILER), LSB)
+	-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
 	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
 else
-#Use FC on LSB
-	$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
+#for LSB
+	env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
+	-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
+	-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
+	$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
 endif
 	rm -f linktest

@ -141,7 +148,7 @@ so : ../$(LIBSONAME)
 ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
 	$(CC) $(CFLAGS) $(LDFLAGS)  -shared -o ../$(LIBSONAME) \
 	-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-	-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
+	$(FEXTRALIB) $(EXTRALIB)
 	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
 	rm -f linktest

@ -191,23 +198,23 @@ static : ../$(LIBNAME)
 	rm -f goto.$(SUFFIX)

 linux.def : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
+	perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)

 osx.def : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
+	perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)

 aix.def : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
+	perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)

 symbol.S : gensymbol
-	perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > symbol.S
+	perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > symbol.S

 test : linktest.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
 	rm -f linktest

 linktest.c : gensymbol ../Makefile.system ../getarch.c
-	perl ./gensymbol linktest  $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > linktest.c
+	perl ./gensymbol linktest  $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > linktest.c

 clean ::
 	@rm -f *.def *.dylib __.SYMDEF*
--- a/exports/gensymbol
+++ b/exports/gensymbol
--- a/getarch.c
+++ b/getarch.c
@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "SANDYBRIDGE"
 #endif

+#ifdef FORCE_HASWELL
+/* Forced target selection for Intel Haswell: defining FORCE makes main()
+ * print CORENAME instead of calling get_corename(). ARCHCONFIG is the string
+ * of -D flags (cache/TLB geometry and ISA feature macros) for this target. */
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE    "X86"
+#define SUBARCHITECTURE "HASWELL"
+#define ARCHCONFIG   "-DHASWELL " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
+                     "-DFMA3"
+#define LIBNAME   "haswell"
+#define CORENAME  "HASWELL"
+#endif
+
 #ifdef FORCE_ATOM
 #define FORCE
 #define FORCE_INTEL
@ -354,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "OPTERON"
 #endif

-#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_PILEDRIVER) || defined (FORCE_BULLDOZER)
+#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"
@ -384,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "BOBCAT"
 #endif

-#if 0
+#if defined (FORCE_BULLDOZER)
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"
@ -400,7 +415,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "BULLDOZER"
 #endif

-#if 0 
+#if defined (FORCE_PILEDRIVER)
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"
@ -679,6 +694,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "generic"
 #endif

+#ifdef FORCE_ARMV7
+/* Forced target selection for ARMv7 (VFP + VFPv3). ARCHCONFIG is the string
+ * of -D flags for this target. L2_SIZE is 512 KiB = 524288 bytes; the earlier
+ * value 512488 was a digit transposition of that number. */
+#define FORCE
+#define ARCHITECTURE    "ARM"
+#define SUBARCHITECTURE "ARMV7"
+#define SUBDIRNAME      "arm"
+#define ARCHCONFIG   "-DARMV7 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+       "-DHAVE_VFPV3 -DHAVE_VFP"
+#define LIBNAME   "armv7"
+#define CORENAME  "ARMV7"
+#endif
+
+#ifdef FORCE_ARMV6
+/* Forced target selection for ARMv6 (VFP only). ARCHCONFIG is the string of
+ * -D flags for this target. L2_SIZE is 512 KiB = 524288 bytes; the earlier
+ * value 512488 was a digit transposition of that number. */
+#define FORCE
+#define ARCHITECTURE    "ARM"
+#define SUBARCHITECTURE "ARMV6"
+#define SUBDIRNAME      "arm"
+#define ARCHCONFIG   "-DARMV6 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+       "-DHAVE_VFP"
+#define LIBNAME   "armv6"
+#define CORENAME  "ARMV6"
+#endif
+
+#ifdef FORCE_ARMV8
+/* Forced target selection for ARMv8 (reported as ARCHITECTURE "ARM64").
+ * ARCHCONFIG is the string of -D flags for this target. L2_SIZE is
+ * 512 KiB = 524288 bytes; the earlier value 512488 was a digit transposition
+ * of that number. */
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "ARMV8"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DARMV8 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
+       "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
+#define LIBNAME   "armv8"
+#define CORENAME  "ARMV8"
+#endif
+
+
 #ifndef FORCE

 #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
@ -719,6 +780,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif

+#ifdef __arm__
+#include "cpuid_arm.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
+
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
 #endif
@ -773,7 +840,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
    printf("CORE=%s\n", CORENAME);
 #else    
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
    printf("CORE=%s\n", get_corename());
 #endif
 #endif
@ -788,6 +855,11 @@ int main(int argc, char *argv[]){

    printf("NUM_CORES=%d\n", get_num_cores());

+#if defined(__arm__) && !defined(FORCE)
+        get_features();
+#endif
+
+
 #if defined(__i386__) || defined(__x86_64__)
 #ifndef FORCE
    get_sse();
--- a/interface/Makefile
+++ b/interface/Makefile
@ -786,7 +786,7 @@ endif

 qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c
 	$(CC) -c $(CFLAGS) -o $(@F) $<
-	
+
 ifndef USE_NETLIB_GEMV
 cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c
 	$(CC) -c $(CFLAGS) -o $(@F) $<
--- a/interface/gemm.c
+++ b/interface/gemm.c
@ -71,6 +71,10 @@
 #endif
 #endif

+#ifndef GEMM_MULTITHREAD_THRESHOLD
+# define GEMM_MULTITHREAD_THRESHOLD 4
+#endif
+
 static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
 #ifndef GEMM3M
  GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN,
--- a/interface/rotmg.c
+++ b/interface/rotmg.c
@ -1,3 +1,37 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2014/02/28 Saar
+*	Test with lapack-3.5.0	: OK 
+*
+**************************************************************************************/
+
+
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
@ -7,6 +41,8 @@
 #define  GAMSQ   16777216.e0
 #define  RGAMSQ  5.9604645e-8

+#define  TWO 2.e0
+
 #ifdef DOUBLE
 #define ABS(x) fabs(x)
 #else
@ -25,181 +61,168 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){

 #endif

-    FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22;
-    int igo, flag;
-    FLOAT dtemp;
+	FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;

-#ifndef CBLAS
-  PRINT_DEBUG_NAME;
-#else
-  PRINT_DEBUG_CNAME;
-#endif
+	if(*dd1 < ZERO)
+	{
+		dflag = -ONE;
+		dh11  = ZERO;
+		dh12  = ZERO;
+		dh21  = ZERO;
+		dh22  = ZERO;

-    dh11 = ZERO;
-    dh12 = ZERO;
-    dh21 = ZERO;
-    dh22 = ZERO;
+		*dd1  = ZERO;
+		*dd2  = ZERO;
+		*dx1  = ZERO;
+	}
+	else
+	{
+		dp2 = *dd2 * dy1;
+		if(dp2 == ZERO)
+		{
+			dflag = -TWO;
+			dparam[0] = dflag;
+			return;
+		}
+		dp1 = *dd1 * *dx1;
+		dq2 =  dp2 * dy1;
+		dq1 =  dp1 * *dx1;
+		if(ABS(dq1) > ABS(dq2))
+		{
+			dh21 = -  dy1 / *dx1;
+			dh12 =    dp2 /  dp1;

-    if (*dd1 < ZERO) goto L60;
+			du   = ONE - dh12 * dh21;
+			if(du > ZERO)
+			{
+				dflag = ZERO;
+				*dd1  = *dd1 / du;
+				*dd2  = *dd2 / du;
+				*dx1  = *dx1 * du;

-    dp2 = *dd2 * dy1;
+			}
+			else
+			{
+				/* du <= 0: without this branch dflag and the H entries
+				 * would be read uninitialized further down. Fall back
+				 * to the same "zero everything, flag = -1" result the
+				 * other error paths in this routine produce. */
+				dflag = -ONE;
+
+				dh11  = ZERO;
+				dh12  = ZERO;
+				dh21  = ZERO;
+				dh22  = ZERO;
+
+				*dd1  = ZERO;
+				*dd2  = ZERO;
+				*dx1  = ZERO;
+			}
+		}
+		else
+		{
+			if(dq2 < ZERO)
+			{
+				dflag = -ONE;
+				
+				dh11  = ZERO;
+				dh12  = ZERO;
+				dh21  = ZERO;
+				dh22  = ZERO;

-    if (dp2 == ZERO) {
-      flag = -2;
-      goto L260;
-    }
+				*dd1  = ZERO;
+				*dd2  = ZERO;
+				*dx1  = ZERO;
+			}
+			else
+			{
+				dflag = ONE;

-    dp1 = *dd1 * *dx1;
-    dq2 =  dp2 * dy1;
-    dq1 =  dp1 * *dx1;
+				dh11  =  dp1 /  dp2;
+				dh22  = *dx1 /  dy1;
+				du    =  ONE + dh11 * dh22;
+				dtemp = *dd2 / du;

-    if (! (ABS(dq1) > ABS(dq2))) goto L40;
-
-    dh21 = -(dy1) / *dx1;
-    dh12 = dp2 / dp1;
-
-    du = ONE - dh12 * dh21;
-
-    if (du <= ZERO) goto L60;
-
-    flag = 0;
-    *dd1 /= du;
-    *dd2 /= du;
-    *dx1 *= du;
-
-    goto L100;
-
-L40:
-    if (dq2 < ZERO) goto L60;
-
-    flag = 1;
-    dh11  = dp1 / dp2;
-    dh22  = *dx1 / dy1;
-    du    = ONE + dh11 * dh22;
-    dtemp = *dd2 / du;
-    *dd2  = *dd1 / du;
-    *dd1  = dtemp;
-    *dx1  = dy1 * du;
-    goto L100;
-
-L60:
-    flag = -1;
-    dh11 = ZERO;
-    dh12 = ZERO;
-    dh21 = ZERO;
-    dh22 = ZERO;
-
-    *dd1 = ZERO;
-    *dd2 = ZERO;
-    *dx1 = ZERO;
-    goto L220;
+				*dd2  = *dd1 / du;
+				*dd1  = dtemp;
+				*dx1  = dy1 * du;
+			}			
+		}


-L70:
-    if (flag < 0) goto L90;
- 
-    if (flag > 0) goto L80;
- 
-    dh11 = ONE;
-    dh22 = ONE;
-    flag = -1;
-    goto L90;
+		/* Rescale dd1 (and below, dd2) into the (RGAMSQ, GAMSQ) window.
+		 * The first rescaling pass switches H to its fully explicit form
+		 * (dflag = -ONE) by filling in the two entries that were implicit
+		 * for dflag == ZERO or dflag == ONE. Later passes, including the
+		 * dd2 loop running after the dd1 loop, must leave those entries
+		 * alone: they already carry accumulated GAM factors, hence the
+		 * explicit dflag == ONE test instead of a bare else. */
+		if(*dd1 != ZERO)
+		{
+			while( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) )
+			{
+				if(dflag == ZERO)
+				{
+					dh11  =  ONE;
+					dh22  =  ONE;
+					dflag = -ONE;
+				}
+				else if(dflag == ONE)
+				{
+					dh21  = -ONE;
+					dh12  =  ONE;
+					dflag = -ONE;
+				}
+				if( *dd1 <= RGAMSQ )
+				{
+					*dd1  = *dd1 * (GAM * GAM);
+					*dx1  = *dx1 / GAM;
+					dh11  = dh11 / GAM;
+					dh12  = dh12 / GAM;
+				}
+				else
+				{
+					*dd1  = *dd1 / (GAM * GAM);
+					*dx1  = *dx1 * GAM;
+					dh11  = dh11 * GAM;
+					dh12  = dh12 * GAM;
+				}
+			}
+		}
+
+		if(*dd2 != ZERO)
+		{
+			while( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) )
+			{
+				if(dflag == ZERO)
+				{
+					dh11  =  ONE;
+					dh22  =  ONE;
+					dflag = -ONE;
+				}
+				else if(dflag == ONE)
+				{
+					dh21  = -ONE;
+					dh12  =  ONE;
+					dflag = -ONE;
+				}
+				if( ABS(*dd2) <= RGAMSQ )
+				{
+					*dd2  = *dd2 * (GAM * GAM);
+					dh21  = dh21 / GAM;
+					dh22  = dh22 / GAM;
+				}
+				else
+				{
+					*dd2  = *dd2 / (GAM * GAM);
+					dh21  = dh21 * GAM;
+					dh22  = dh22 * GAM;
+				}
+			}
+		}
+					
+	}

-L80:
-    dh21 = -ONE;
-    dh12 = ONE;
-    flag = -1;
+	if(dflag < ZERO)
+	{
+		dparam[1] = dh11;
+		dparam[2] = dh21;
+		dparam[3] = dh12;
+		dparam[4] = dh22;
+	}
+	else
+	{
+		if(dflag == ZERO)
+		{
+			dparam[2] = dh21;
+			dparam[3] = dh12;
+		}
+		else
+		{
+			dparam[1] = dh11;
+			dparam[4] = dh22;
+		}
+	}

-L90:
-    switch (igo) {
-	case 0: goto L120;
-	case 1: goto L150;
-	case 2: goto L180;
-	case 3: goto L210;
-    }

-L100:
-    if (!(*dd1 <= RGAMSQ)) goto L130;
-    if (*dd1 == ZERO) goto L160;
-    igo = 0;
-    goto L70;
-
-L120:
-    *dd1 *= GAM * GAM;
-    *dx1 /= GAM;
-    dh11 /= GAM;
-    dh12 /= GAM;
-    goto L100;
-
-L130:
-    if (! (*dd1 >= GAMSQ)) {
-	goto L160;
-    }
-    igo = 1;
-    goto L70;
-
-L150:
-    *dd1 /= GAM * GAM;
-    *dx1 *= GAM;
-    dh11 *= GAM;
-    dh12 *= GAM;
-    goto L130;
-
-L160:
-    if (! (ABS(*dd2) <= RGAMSQ)) {
-	goto L190;
-    }
-    if (*dd2 == ZERO) {
-	goto L220;
-    }
-    igo = 2;
-    goto L70;
-
-L180:
-/* Computing 2nd power */
-    *dd2 *= GAM * GAM;
-    dh21 /= GAM;
-    dh22 /= GAM;
-    goto L160;
-
-L190:
-    if (! (ABS(*dd2) >= GAMSQ)) {
-	goto L220;
-    }
-    igo = 3;
-    goto L70;
-
-L210:
-/* Computing 2nd power */
-    *dd2 /= GAM * GAM;
-    dh21 *= GAM;
-    dh22 *= GAM;
-    goto L190;
-
-L220:
-    if (flag < 0) {
-	goto L250;
-    } else if (flag == 0) {
-	goto L230;
-    } else {
-	goto L240;
-    }
-L230:
-    dparam[2] = dh21;
-    dparam[3] = dh12;
-    goto L260;
-L240:
-    dparam[2] = dh11;
-    dparam[4] = dh22;
-    goto L260;
-L250:
-    dparam[1] = dh11;
-    dparam[2] = dh21;
-    dparam[3] = dh12;
-    dparam[4] = dh22;
-L260:
-    dparam[0] = (FLOAT) flag;
-    return;
+	dparam[0] = dflag;
+	return;
 }


--- a/interface/sdsdot.c
+++ b/interface/sdsdot.c
@ -52,8 +52,8 @@ FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint *
  FLOATRET ret;

  PRINT_DEBUG_NAME;
-
-  if (n <= 0) return 0.;
+  
+  if (n <= 0) return(*a) ;

  IDEBUG_START;

@ -80,7 +80,7 @@ FLOAT CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint in

  PRINT_DEBUG_CNAME;

-  if (n <= 0) return 0.;
+  if (n <= 0) return (alpha);

  IDEBUG_START;

--- a/interface/syr2k.c
+++ b/interface/syr2k.c
@ -146,8 +146,10 @@ void NAME(char *UPLO, char *TRANS,
  if (uplo_arg  == 'L') uplo  = 1;

  if (trans_arg == 'N') trans = 0;
+#ifndef HEMM
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 0;
+#endif
  if (trans_arg == 'C') trans = 1;
  
  nrowa = args.n;
--- a/interface/syrk.c
+++ b/interface/syrk.c
@ -149,8 +149,10 @@ void NAME(char *UPLO, char *TRANS,
  if (uplo_arg  == 'L') uplo  = 1;

  if (trans_arg == 'N') trans = 0;
+#ifndef HEMM
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 0;
+#endif
  if (trans_arg == 'C') trans = 1;
  
  nrowa = args.n;
--- a/interface/trtri.c
+++ b/interface/trtri.c
@ -60,7 +60,6 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *
 };
 #endif

-extern void BLASFUNC(dtrtrilapack)(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info);

 int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){

@ -133,18 +132,6 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
  if (args.nthreads == 1) {
 #endif

-#if DOUBLE
-    // double trtri_U single thread error
-    // call dtrtri from lapack for a walk around.
-    if(uplo==0){
-      BLASFUNC(dtrtrilapack)(UPLO, DIAG, N, a, ldA, Info);
-#ifndef PPC440
-      blas_memory_free(buffer);
-#endif
-      return 0;
-    }
-#endif
-
    *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
    
 #ifdef SMP
--- a/kernel/Makefile.L1
+++ b/kernel/Makefile.L1
@ -674,7 +674,7 @@ $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

 $(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) 
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@

 $(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) 
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@ -14,6 +14,20 @@ ifeq ($(ARCH), MIPS)
 USE_GEMM3M = 1
 endif

+ifeq ($(ARCH), arm)
+USE_TRMM = 1
+endif
+
+ifeq ($(ARCH), arm64)
+USE_TRMM = 1
+endif
+
+ifeq ($(TARGET), LOONGSON3B)												 
+USE_TRMM = 1
+endif
+
+
+
 SKERNELOBJS	+= \
 	sgemm_kernel$(TSUFFIX).$(SUFFIX) \
 	$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
@ -498,7 +512,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
 $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
 	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@

-ifeq ($(TARGET), LOONGSON3B)												 
+
+ifdef USE_TRMM											 
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@

@ -668,6 +683,9 @@ $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 endif

+
+
+
 $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@

--- a/kernel/arm/KERNEL
+++ b/kernel/arm/KERNEL
@ -0,0 +1,46 @@
+ifndef SNRM2KERNEL
+SNRM2KERNEL = nrm2.c
+endif
+
+ifndef DNRM2KERNEL
+DNRM2KERNEL = nrm2.c
+endif
+
+ifndef CNRM2KERNEL
+CNRM2KERNEL = znrm2.c
+endif
+
+ifndef ZNRM2KERNEL
+ZNRM2KERNEL = znrm2.c
+endif
+
+ifndef SCABS_KERNEL
+SCABS_KERNEL	= ../generic/cabs.c
+endif
+
+ifndef DCABS_KERNEL
+DCABS_KERNEL	= ../generic/cabs.c
+endif
+
+ifndef QCABS_KERNEL
+QCABS_KERNEL	= ../generic/cabs.c
+endif
+
+ifndef LSAME_KERNEL
+LSAME_KERNEL	= ../generic/lsame.c
+endif
+
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
+
+
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@ -0,0 +1,142 @@
+SAMAXKERNEL  = iamax_vfp.S
+DAMAXKERNEL  = iamax_vfp.S
+CAMAXKERNEL  = iamax_vfp.S
+ZAMAXKERNEL  = iamax_vfp.S
+
+SAMINKERNEL  = iamax_vfp.S
+DAMINKERNEL  = iamax_vfp.S
+CAMINKERNEL  = iamax_vfp.S
+ZAMINKERNEL  = iamax_vfp.S
+
+SMAXKERNEL   = iamax_vfp.S
+DMAXKERNEL   = iamax_vfp.S
+
+SMINKERNEL   = iamax_vfp.S
+DMINKERNEL   = iamax_vfp.S
+
+ISAMAXKERNEL = iamax_vfp.S
+IDAMAXKERNEL = iamax_vfp.S
+ICAMAXKERNEL = iamax_vfp.S
+IZAMAXKERNEL = iamax_vfp.S
+
+ISAMINKERNEL = iamax_vfp.S
+IDAMINKERNEL = iamax_vfp.S
+ICAMINKERNEL = iamax_vfp.S
+IZAMINKERNEL = iamax_vfp.S
+
+ISMAXKERNEL  = iamax_vfp.S
+IDMAXKERNEL  = iamax_vfp.S
+
+ISMINKERNEL  = iamax_vfp.S
+IDMINKERNEL  = iamax_vfp.S
+
+SASUMKERNEL  = asum_vfp.S
+DASUMKERNEL  = asum_vfp.S
+CASUMKERNEL  = asum_vfp.S
+ZASUMKERNEL  = asum_vfp.S
+
+SAXPYKERNEL  = axpy_vfp.S
+DAXPYKERNEL  = axpy_vfp.S
+CAXPYKERNEL  = axpy_vfp.S
+ZAXPYKERNEL  = axpy_vfp.S
+
+SCOPYKERNEL  = copy.c
+DCOPYKERNEL  = copy.c
+CCOPYKERNEL  = zcopy.c
+ZCOPYKERNEL  = zcopy.c
+
+SDOTKERNEL   = sdot_vfp.S
+DDOTKERNEL   = ddot_vfp.S
+CDOTKERNEL   = cdot_vfp.S
+ZDOTKERNEL   = zdot_vfp.S
+
+SNRM2KERNEL  = nrm2_vfp.S
+DNRM2KERNEL  = nrm2_vfp.S
+CNRM2KERNEL  = nrm2_vfp.S
+ZNRM2KERNEL  = nrm2_vfp.S
+
+SROTKERNEL   = rot_vfp.S
+DROTKERNEL   = rot_vfp.S
+CROTKERNEL   = rot_vfp.S
+ZROTKERNEL   = rot_vfp.S
+
+SSCALKERNEL  =  scal_vfp.S
+DSCALKERNEL  =  scal_vfp.S
+CSCALKERNEL  =  scal_vfp.S
+ZSCALKERNEL  =  scal_vfp.S
+
+SSWAPKERNEL  = swap_vfp.S
+DSWAPKERNEL  = swap_vfp.S
+CSWAPKERNEL  = swap_vfp.S
+ZSWAPKERNEL  = swap_vfp.S
+
+SGEMVNKERNEL = gemv_n_vfp.S
+DGEMVNKERNEL = gemv_n_vfp.S
+CGEMVNKERNEL = cgemv_n_vfp.S
+ZGEMVNKERNEL = zgemv_n_vfp.S
+
+SGEMVTKERNEL = gemv_t_vfp.S
+DGEMVTKERNEL = gemv_t_vfp.S
+CGEMVTKERNEL = cgemv_t_vfp.S
+ZGEMVTKERNEL = zgemv_t_vfp.S
+
+STRMMKERNEL	= strmm_kernel_4x2_vfp.S
+DTRMMKERNEL	= dtrmm_kernel_4x2_vfp.S
+CTRMMKERNEL	= ctrmm_kernel_2x2_vfp.S
+ZTRMMKERNEL	= ztrmm_kernel_2x2_vfp.S
+
+SGEMMKERNEL    = sgemm_kernel_4x2_vfp.S		
+SGEMMINCOPY    = sgemm_ncopy_4_vfp.S
+SGEMMITCOPY    = sgemm_tcopy_4_vfp.S
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
+SGEMMONCOPY    = sgemm_ncopy_2_vfp.S
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
+DGEMMKERNEL    = dgemm_kernel_4x2_vfp.S		
+DGEMMINCOPY    = dgemm_ncopy_4_vfp.S
+DGEMMITCOPY    = dgemm_tcopy_4_vfp.S
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPY    = dgemm_ncopy_2_vfp.S
+DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+CGEMMKERNEL    = cgemm_kernel_2x2_vfp.S
+CGEMMONCOPY    = cgemm_ncopy_2_vfp.S
+CGEMMOTCOPY    = cgemm_tcopy_2_vfp.S
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZGEMMKERNEL    = zgemm_kernel_2x2_vfp.S
+ZGEMMONCOPY    = zgemm_ncopy_2_vfp.S
+ZGEMMOTCOPY    = zgemm_tcopy_2_vfp.S
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+
+
+
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@ -0,0 +1,141 @@
+# Kernel source selection for the ARMV7 target (level 1 and level 2 routines).
+# Each <ROUTINE>KERNEL variable names the source file assigned to that routine.
+# NOTE(review): presumably consumed by the kernel build makefiles - confirm.
+
+# Every max/min variant below (absolute or signed, value or index) is assigned
+# the same source file, iamax_vfp.S.
+# NOTE(review): presumably specialised through preprocessor defines - confirm.
+SAMAXKERNEL  = iamax_vfp.S
+DAMAXKERNEL  = iamax_vfp.S
+CAMAXKERNEL  = iamax_vfp.S
+ZAMAXKERNEL  = iamax_vfp.S
+
+SAMINKERNEL  = iamax_vfp.S
+DAMINKERNEL  = iamax_vfp.S
+CAMINKERNEL  = iamax_vfp.S
+ZAMINKERNEL  = iamax_vfp.S
+
+SMAXKERNEL   = iamax_vfp.S
+DMAXKERNEL   = iamax_vfp.S
+
+SMINKERNEL   = iamax_vfp.S
+DMINKERNEL   = iamax_vfp.S
+
+ISAMAXKERNEL = iamax_vfp.S
+IDAMAXKERNEL = iamax_vfp.S
+ICAMAXKERNEL = iamax_vfp.S
+IZAMAXKERNEL = iamax_vfp.S
+
+ISAMINKERNEL = iamax_vfp.S
+IDAMINKERNEL = iamax_vfp.S
+ICAMINKERNEL = iamax_vfp.S
+IZAMINKERNEL = iamax_vfp.S
+
+ISMAXKERNEL  = iamax_vfp.S
+IDMAXKERNEL  = iamax_vfp.S
+
+ISMINKERNEL  = iamax_vfp.S
+IDMINKERNEL  = iamax_vfp.S
+
+# swap, asum, axpy and rot: one VFP assembly source shared by all four precisions.
+SSWAPKERNEL  = swap_vfp.S
+DSWAPKERNEL  = swap_vfp.S
+CSWAPKERNEL  = swap_vfp.S
+ZSWAPKERNEL  = swap_vfp.S
+
+SASUMKERNEL  = asum_vfp.S
+DASUMKERNEL  = asum_vfp.S
+CASUMKERNEL  = asum_vfp.S
+ZASUMKERNEL  = asum_vfp.S
+
+SAXPYKERNEL  = axpy_vfp.S
+DAXPYKERNEL  = axpy_vfp.S
+CAXPYKERNEL  = axpy_vfp.S
+ZAXPYKERNEL  = axpy_vfp.S
+
+# The copy routines are assigned C sources here, not *_vfp.S assembly.
+SCOPYKERNEL  = copy.c
+DCOPYKERNEL  = copy.c
+CCOPYKERNEL  = zcopy.c
+ZCOPYKERNEL  = zcopy.c
+
+# dot: a separate assembly source per precision.
+SDOTKERNEL   = sdot_vfp.S
+DDOTKERNEL   = ddot_vfp.S
+CDOTKERNEL   = cdot_vfp.S
+ZDOTKERNEL   = zdot_vfp.S
+
+# nrm2 is the only level 1 routine in this list that uses a *_vfpv3.S source.
+SNRM2KERNEL  = nrm2_vfpv3.S
+DNRM2KERNEL  = nrm2_vfpv3.S
+CNRM2KERNEL  = nrm2_vfpv3.S
+ZNRM2KERNEL  = nrm2_vfpv3.S
+
+SROTKERNEL   = rot_vfp.S
+DROTKERNEL   = rot_vfp.S
+CROTKERNEL   = rot_vfp.S
+ZROTKERNEL   = rot_vfp.S
+
+# DSCAL is assigned the C source scal.c; the other precisions use scal_vfp.S.
+# NOTE(review): the reason for this exception is not recorded here - confirm it is intended.
+SSCALKERNEL  = scal_vfp.S
+DSCALKERNEL  = scal.c
+CSCALKERNEL  = scal_vfp.S
+ZSCALKERNEL  = scal_vfp.S
+
+# DGEMVN is assigned the C source gemv_n.c; the other precisions use assembly.
+# NOTE(review): the reason for this exception is not recorded here - confirm it is intended.
+SGEMVNKERNEL = gemv_n_vfp.S
+DGEMVNKERNEL = gemv_n.c
+CGEMVNKERNEL = cgemv_n_vfp.S
+ZGEMVNKERNEL = zgemv_n_vfp.S
+
+SGEMVTKERNEL = gemv_t_vfp.S
+DGEMVTKERNEL = gemv_t_vfp.S
+CGEMVTKERNEL = cgemv_t_vfp.S
+ZGEMVTKERNEL = zgemv_t_vfp.S
+
+# Level 3 kernel selection for the ARMV7 target.
+# TRMM and GEMM compute kernels are assigned *_vfpv3.S sources:
+# 4x4 for the real precisions, 2x2 for the complex ones (per the file names).
+STRMMKERNEL  =  strmm_kernel_4x4_vfpv3.S
+DTRMMKERNEL  =  dtrmm_kernel_4x4_vfpv3.S		
+CTRMMKERNEL  =  ctrmm_kernel_2x2_vfpv3.S
+ZTRMMKERNEL  =  ztrmm_kernel_2x2_vfpv3.S
+
+# The commented-out assignment below names a generic C kernel that is not selected.
+# INCOPY/ITCOPY and their OBJ names are left empty for SGEMM and DGEMM;
+# only the ONCOPY/OTCOPY routines are assigned.
+# NOTE(review): presumably the inner copies are not needed for this blocking - confirm.
+#SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c		
+SGEMMKERNEL    =  sgemm_kernel_4x4_vfpv3.S		
+SGEMMINCOPY    =  
+SGEMMITCOPY    = 
+SGEMMONCOPY    =  sgemm_ncopy_4_vfp.S
+SGEMMOTCOPY    =  sgemm_tcopy_4_vfp.S
+SGEMMINCOPYOBJ = 
+SGEMMITCOPYOBJ = 
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
+DGEMMKERNEL    =  dgemm_kernel_4x4_vfpv3.S		
+DGEMMINCOPY    =  
+DGEMMITCOPY    =  
+DGEMMONCOPY    =  dgemm_ncopy_4_vfp.S
+DGEMMOTCOPY    =  dgemm_tcopy_4_vfp.S
+DGEMMINCOPYOBJ = 
+DGEMMITCOPYOBJ = 
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+# The complex GEMM variants define no INCOPY/ITCOPY variables at all.
+CGEMMKERNEL    = cgemm_kernel_2x2_vfpv3.S
+CGEMMONCOPY    = cgemm_ncopy_2_vfp.S
+CGEMMOTCOPY    = cgemm_tcopy_2_vfp.S
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZGEMMKERNEL    = zgemm_kernel_2x2_vfpv3.S
+ZGEMMONCOPY    = zgemm_ncopy_2_vfp.S
+ZGEMMOTCOPY    = zgemm_tcopy_2_vfp.S
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+
+# Every TRSM kernel, in all four precisions, is assigned the generic C source.
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+
--- a/kernel/arm/Makefile
+++ b/kernel/arm/Makefile
@ -0,0 +1,2 @@
+clean ::
+
--- a/kernel/arm/amax.c
+++ b/kernel/arm/amax.c
@ -0,0 +1,73 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+/*
+ * Largest absolute value among n elements of x.
+ *
+ *   n      number of elements to examine
+ *   x      input vector; element k is read from x[k * inc_x]
+ *   inc_x  stride between consecutive elements, counted in FLOATs
+ *
+ * Returns 0.0 when n <= 0 or inc_x < 1; x is not dereferenced in that case.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+
+	/* n == 0 has to bail out as well: an empty vector has no x[0] to seed the maximum */
+	if (n <= 0 || inc_x < 1 ) return(maxf);
+
+	maxf=ABS(x[0]);
+
+	while(i < n)
+	{
+		/* maxf always holds an ABS() result, so it needs no second ABS() here */
+		if( ABS(x[ix]) > maxf )
+		{
+			maxf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(maxf);
+}
+	
+
--- a/kernel/arm/amin.c
+++ b/kernel/arm/amin.c
@ -0,0 +1,73 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+/*
+ * Smallest absolute value among n elements of x.
+ *
+ *   n      number of elements to examine
+ *   x      input vector; element k is read from x[k * inc_x]
+ *   inc_x  stride between consecutive elements, counted in FLOATs
+ *
+ * Returns 0.0 when n <= 0 or inc_x < 1; x is not dereferenced in that case.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT minf=0.0;
+
+	/* n == 0 has to bail out as well: an empty vector has no x[0] to seed the minimum */
+	if (n <= 0 || inc_x < 1 ) return(minf);
+
+	minf=ABS(x[0]);
+
+	while(i < n)
+	{
+		/* minf always holds an ABS() result, so it needs no second ABS() here */
+		if( ABS(x[ix]) < minf )
+		{
+			minf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(minf);
+}
+	
+
--- a/kernel/arm/asum.c
+++ b/kernel/arm/asum.c
@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+/*
+ * Sum of the absolute values of n elements of x.
+ *
+ *   n      number of elements to add up
+ *   x      input vector; element k is read from x[k * inc_x]
+ *   inc_x  stride between consecutive elements, counted in FLOATs
+ *
+ * Returns 0.0 when n < 0 or inc_x < 1. Elements are added in ascending
+ * index order.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	FLOAT total = 0.0;
+	BLASLONG pos;
+	BLASLONG limit;
+
+	if (n < 0 || inc_x < 1)
+		return total;
+
+	/* one past the index of the last element to visit */
+	limit = n * inc_x;
+
+	for (pos = 0; pos < limit; pos += inc_x)
+		total += ABS(x[pos]);
+
+	return total;
+}
+	
+
--- a/kernel/arm/asum_vfp.S
+++ b/kernel/arm/asum_vfp.S
@ -0,0 +1,481 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/11 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+
+
+#define I	r12
+
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if	!defined(COMPLEX)
+
+#if	defined(DOUBLE)
+
+// Real double, unit stride: adds |x| for four consecutive doubles.
+// Elements 0 and 2 go to the partial sum d0, elements 1 and 3 to d1.
+// X is advanced past the four elements by the post-incrementing loads.
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE  ]
+	fldmiad	X!, { d4 - d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	fldmiad	X!, { d6 - d7 }
+	vabs.f64   d6,  d6
+	vadd.f64   d1  , d1,  d5
+	vabs.f64   d7,  d7
+	vadd.f64   d0  , d0,  d6
+	vadd.f64   d1  , d1,  d7
+
+.endm
+
+// Real double, unit stride: adds |x| for one double to d0 and advances X.
+.macro KERNEL_F1
+
+	fldmiad	X!, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+
+.endm
+
+
+// Real double, strided: adds |x| for four elements to d0.
+// X is stepped by INC_X after each load, so INC_X must already be a byte offset.
+.macro KERNEL_S4
+
+	fldmiad	X, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	add	X, X, INC_X
+
+	fldmiad	X, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	add	X, X, INC_X
+
+	fldmiad	X, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	add	X, X, INC_X
+
+	fldmiad	X, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	add	X, X, INC_X
+
+.endm
+
+
+// Real double, strided: adds |x| for one element to d0, then steps X by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmiad	X, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	add	X, X, INC_X
+
+.endm
+
+#else
+
+// Real single, unit stride: adds |x| for four consecutive floats.
+// Elements 0 and 2 go to the partial sum s0, elements 1 and 3 to s1.
+// This variant issues no prefetch of its own.
+.macro KERNEL_F4
+
+	fldmias	X!, { s4 - s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	fldmias	X!, { s6 - s7 }
+	vabs.f32   s6,  s6
+	vadd.f32   s1  , s1,  s5
+	vabs.f32   s7,  s7
+	vadd.f32   s0  , s0,  s6
+	vadd.f32   s1  , s1,  s7
+
+.endm
+
+// Real single, unit stride: adds |x| for one float to s0 and advances X.
+.macro KERNEL_F1
+
+	fldmias	X!, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+
+.endm
+
+
+// Real single, strided: adds |x| for four elements to s0.
+// X is stepped by INC_X after each load, so INC_X must already be a byte offset.
+.macro KERNEL_S4
+
+	fldmias	X, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	add	X, X, INC_X
+
+	fldmias	X, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	add	X, X, INC_X
+
+	fldmias	X, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	add	X, X, INC_X
+
+	fldmias	X, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	add	X, X, INC_X
+
+.endm
+
+
+// Real single, strided: adds |x| for one element to s0, then steps X by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmias	X, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	add	X, X, INC_X
+
+.endm
+
+
+#endif
+
+#else
+
+#if	defined(DOUBLE)
+
+// Complex double, unit stride: processes four complex elements, i.e. eight doubles.
+// Both components of every element are added as absolute values,
+// alternating between the partial sums d0 and d1.
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE  ]
+	fldmiad	X!, { d4 - d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	fldmiad	X!, { d6 - d7 }
+	vabs.f64   d6,  d6
+	vadd.f64   d1  , d1,  d5
+	vabs.f64   d7,  d7
+	vadd.f64   d0  , d0,  d6
+	vadd.f64   d1  , d1,  d7
+
+	pld	[ X, #X_PRE  ]
+	fldmiad	X!, { d4 - d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	fldmiad	X!, { d6 - d7 }
+	vabs.f64   d6,  d6
+	vadd.f64   d1  , d1,  d5
+	vabs.f64   d7,  d7
+	vadd.f64   d0  , d0,  d6
+	vadd.f64   d1  , d1,  d7
+
+
+.endm
+
+// Complex double, unit stride: one complex element, two doubles added to d0.
+.macro KERNEL_F1
+
+	fldmiad	X!, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+
+	fldmiad	X!, { d4 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+
+
+.endm
+
+
+// Complex double, strided: four complex elements.
+// Each step loads the two components at X, adds both absolute values to d0,
+// then steps X by INC_X, which must already be a byte offset.
+.macro KERNEL_S4
+
+	fldmiad	X, { d4 -d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	vadd.f64   d0  , d0,  d5
+	add	X, X, INC_X
+
+	fldmiad	X, { d4 -d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	vadd.f64   d0  , d0,  d5
+	add	X, X, INC_X
+
+	fldmiad	X, { d4 -d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	vadd.f64   d0  , d0,  d5
+	add	X, X, INC_X
+
+	fldmiad	X, { d4 -d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	vadd.f64   d0  , d0,  d5
+	add	X, X, INC_X
+
+.endm
+
+
+// Complex double, strided: one complex element added to d0, then X steps by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmiad	X, { d4 -d5 }
+	vabs.f64   d4,  d4
+	vadd.f64   d0  , d0,  d4
+	vabs.f64   d5,  d5
+	vadd.f64   d0  , d0,  d5
+	add	X, X, INC_X
+
+.endm
+
+#else
+
+// Complex single, unit stride: processes four complex elements, i.e. eight floats.
+// Both components of every element are added as absolute values,
+// alternating between the partial sums s0 and s1.
+// Only the first half is preceded by a prefetch.
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE  ]
+	fldmias	X!, { s4 - s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	fldmias	X!, { s6 - s7 }
+	vabs.f32   s6,  s6
+	vadd.f32   s1  , s1,  s5
+	vabs.f32   s7,  s7
+	vadd.f32   s0  , s0,  s6
+	vadd.f32   s1  , s1,  s7
+
+	fldmias	X!, { s4 - s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	fldmias	X!, { s6 - s7 }
+	vabs.f32   s6,  s6
+	vadd.f32   s1  , s1,  s5
+	vabs.f32   s7,  s7
+	vadd.f32   s0  , s0,  s6
+	vadd.f32   s1  , s1,  s7
+
+
+.endm
+
+// Complex single, unit stride: one complex element, two floats added to s0.
+.macro KERNEL_F1
+
+	fldmias	X!, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+
+	fldmias	X!, { s4 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+
+.endm
+
+
+// Complex single, strided: four complex elements.
+// Each step loads the two components at X, adds both absolute values to s0,
+// then steps X by INC_X, which must already be a byte offset.
+.macro KERNEL_S4
+
+	fldmias	X, { s4 -s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	vadd.f32   s0  , s0,  s5
+	add	X, X, INC_X
+
+	fldmias	X, { s4 -s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	vadd.f32   s0  , s0,  s5
+	add	X, X, INC_X
+
+	fldmias	X, { s4 -s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	vadd.f32   s0  , s0,  s5
+	add	X, X, INC_X
+
+	fldmias	X, { s4 -s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	vadd.f32   s0  , s0,  s5
+	add	X, X, INC_X
+
+.endm
+
+
+// Complex single, strided: one complex element added to s0, then X steps by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmias	X, { s4 -s5 }
+	vabs.f32   s4,  s4
+	vadd.f32   s0  , s0,  s4
+	vabs.f32   s5,  s5
+	vadd.f32   s0  , s0,  s5
+	add	X, X, INC_X
+
+.endm
+
+#endif
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* Entry point: sum of absolute values
+*
+*	r0  (N)		number of elements
+*	r1  (X)		address of the first element
+*	r2  (INC_X)	stride between elements, counted in elements
+*
+*	The sum is accumulated in s0/s1 (d0/d1 when DOUBLE) and returned in s0 (d0).
+*	In COMPLEX builds one element is a pair of values and both are added.
+*	Returns 0 when N <= 0 or INC_X <= 0.
+*
+*	Clobbers r0 - r2 (through the macros), r12, the flags and the scratch VFP
+*	registers used by the KERNEL_* macros (s0 - s7 / d0 - d7).
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	// Clear the accumulators by loading an integer zero. Subtracting a register
+	// from itself is not a safe way to clear it: when the stale contents are
+	// NaN or infinity the difference is NaN, which would poison the whole sum.
+	mov	r12, #0
+	vmov	s0, r12						// s0 = 0, also low half of d0
+	vmov	s1, r12						// s1 = 0, also high half of d0
+#if defined(DOUBLE)
+	vmov	s2, r12						// d1 is the register pair s2:s3
+	vmov	s3, r12
+#endif
+
+	cmp	N, #0
+	ble	asum_kernel_L999				// nothing to add
+
+	cmp	INC_X, #0
+	ble	asum_kernel_L999				// zero or negative stride: return 0
+
+	cmp	INC_X, #1
+	bne	asum_kernel_S_BEGIN				// strided access
+
+
+asum_kernel_F_BEGIN:						// unit stride
+
+	asrs	I, N, #2					// I = N / 4
+	ble	asum_kernel_F1
+
+	.align 5
+
+asum_kernel_F4:							// two KERNEL_F4 blocks per loop trip
+
+#if !defined(DOUBLE) && !defined(COMPLEX)
+	pld	[ X, #X_PRE  ]					// real single KERNEL_F4 has no prefetch of its own
+#endif
+	KERNEL_F4
+
+	subs	I, I, #1
+	ble	asum_kernel_F1					// odd block count: leave after the first half
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	asum_kernel_F4
+
+asum_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	asum_kernel_L999
+
+asum_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+        bne     asum_kernel_F10
+
+	b	asum_kernel_L999
+
+asum_kernel_S_BEGIN:
+
+	// turn the element stride into a byte stride for the KERNEL_S* macros
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+#endif
+
+#else
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+#endif
+
+#endif
+
+	asrs	I, N, #2					// I = N / 4
+	ble	asum_kernel_S1
+
+	.align 5
+
+asum_kernel_S4:
+
+	KERNEL_S4
+
+	subs	I, I, #1
+	bne	asum_kernel_S4
+
+asum_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	asum_kernel_L999
+
+asum_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+        bne     asum_kernel_S10
+
+
+asum_kernel_L999:
+
+	// fold the two partial sums together
+#if defined(DOUBLE)
+	vadd.f64	d0 , d0, d1				// set return value
+#else
+	vadd.f32	s0 , s0, s1				// set return value
+#endif
+
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/axpy.c
@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+
+/*
+ * y := y + da * x, applied to n elements.
+ *
+ *   n          number of elements to update
+ *   da         scalar multiplier
+ *   x, inc_x   source vector and its stride, counted in FLOATs
+ *   y, inc_y   vector updated in place and its stride, counted in FLOATs
+ *   dummy0, dummy1, dummy, dummy2   not used by this kernel
+ *
+ * Nothing is touched when n < 0 or da compares equal to zero.
+ * Always returns 0.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG k;
+	BLASLONG xi;
+	BLASLONG yi;
+
+	if (n < 0 || da == 0.0)
+		return 0;
+
+	for (k = 0, xi = 0, yi = 0; k < n; k++, xi += inc_x, yi += inc_y)
+		y[yi] += da * x[xi] ;
+
+	return 0;
+}
+	
+
--- a/kernel/arm/axpy_vfp.S
+++ b/kernel/arm/axpy_vfp.S
@ -0,0 +1,503 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/14 Saar
+* 	 BLASTEST 		: xOK
+* 	 CTEST			: xOK
+* 	 TEST			: xOK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_INC_X	[fp, #0 ]
+#define	OLD_Y		[fp, #4 ]
+#define	OLD_INC_Y	[fp, #8 ]
+
+
+#define	N	r0
+#define Y	r1
+#define	INC_X	r2
+#define	X	r3
+#define INC_Y	r4
+
+#define I	r12
+
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/*****************************************************************************************/
+
+// Multiply-accumulate opcodes used by the COMPLEX kernels below.
+// The macros apply them to one element (xr, xi) with the scalar in d0/d1 (s0/s1):
+//	FMAC_R1  real part, operands d0 and xr
+//	FMAC_R2  real part, operands d1 and xi
+//	FMAC_I1  imaginary part, operands d0 and xi
+//	FMAC_I2  imaginary part, operands d1 and xr
+// fmac adds the product to the destination, fnmac subtracts it.
+// Without CONJ only the d1 * xi term is subtracted; with CONJ only the
+// d0 * xi term is subtracted.
+#if !defined(CONJ)
+
+#if defined(DOUBLE)
+
+#define	FMAC_R1	fmacd
+#define FMAC_R2 fnmacd
+#define	FMAC_I1	fmacd
+#define FMAC_I2 fmacd
+
+#else
+
+#define	FMAC_R1	fmacs
+#define FMAC_R2 fnmacs
+#define	FMAC_I1	fmacs
+#define FMAC_I2 fmacs
+
+#endif
+
+#else	// CONJ
+
+#if defined(DOUBLE)
+
+#define	FMAC_R1	fmacd
+#define FMAC_R2 fmacd
+#define	FMAC_I1	fnmacd
+#define FMAC_I2 fmacd
+
+#else
+
+#define	FMAC_R1	fmacs
+#define FMAC_R2 fmacs
+#define	FMAC_I1	fnmacs
+#define FMAC_I2 fmacs
+
+#endif
+
+#endif
+
+
+#if	!defined(COMPLEX)
+
+#if	defined(DOUBLE)
+
+// Real double, unit stride: y[i] += d0 * x[i] for four consecutive doubles.
+// x is loaded into d4 - d7 with write-back. y is loaded into d8 - d11 without
+// write-back, and Y only advances as each result is stored.
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE ]
+	fldmiad		X!,  { d4 - d7  }
+	pld	[ Y, #X_PRE ]
+	fldmiad		Y ,  { d8 - d11 }
+	fmacd   	d8 , d0, d4
+	fstmiad		Y!, { d8 }
+	fmacd   	d9 , d0, d5
+	fstmiad		Y!, { d9 }
+	fmacd   	d10, d0, d6
+	fstmiad		Y!, { d10 }
+	fmacd   	d11, d0, d7
+	fstmiad		Y!, { d11 }
+
+
+.endm
+
+
+// Real double, unit stride: y[0] += d0 * x[0]; X and Y advance by one double.
+.macro KERNEL_F1
+
+	fldmiad		X!,  { d4 }
+	fldmiad		Y ,  { d8 }
+	fmacd   	d8 , d0, d4
+	fstmiad		Y!, { d8 }
+
+.endm
+
+// Real double, strided: y[0] += d0 * x[0], then X and Y are stepped by
+// INC_X and INC_Y, which must already be byte offsets.
+.macro KERNEL_S1
+
+	fldmiad		X ,  { d4 }
+	fldmiad		Y ,  { d8 }
+	fmacd   	d8 , d0, d4
+	fstmiad		Y , { d8 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+#else
+
+// Real single, unit stride: y[i] += s0 * x[i] for four consecutive floats.
+// x is loaded into s4 - s7 with write-back. y is loaded into s8 - s11 without
+// write-back, and Y only advances as each result is stored.
+// This variant issues no prefetch of its own.
+.macro KERNEL_F4
+
+	fldmias		X!,  { s4 - s7  }
+	fldmias		Y ,  { s8 - s11 }
+	fmacs   	s8 , s0, s4
+	fstmias		Y!, { s8 }
+	fmacs   	s9 , s0, s5
+	fstmias		Y!, { s9 }
+	fmacs   	s10, s0, s6
+	fstmias		Y!, { s10 }
+	fmacs   	s11, s0, s7
+	fstmias		Y!, { s11 }
+
+
+.endm
+
+
+// Real single, unit stride: y[0] += s0 * x[0]; X and Y advance by one float.
+.macro KERNEL_F1
+
+	fldmias		X!,  { s4 }
+	fldmias		Y ,  { s8 }
+	fmacs   	s8 , s0, s4
+	fstmias		Y!, { s8 }
+
+.endm
+
+// Real single, strided: y[0] += s0 * x[0], then X and Y are stepped by
+// INC_X and INC_Y, which must already be byte offsets.
+.macro KERNEL_S1
+
+	fldmias		X ,  { s4 }
+	fldmias		Y ,  { s8 }
+	fmacs   	s8 , s0, s4
+	fstmias		Y , { s8 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+#endif
+
+#else
+
+#if	defined(DOUBLE)
+
+// Complex double, unit stride: updates four complex elements of y.
+// The work is done in two halves of two elements each. Per element the real
+// part (d8 / d10) receives FMAC_R1 and FMAC_R2, the imaginary part (d9 / d11)
+// receives FMAC_I1 and FMAC_I2, with the scalar held in d0 and d1.
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE ]
+	fldmiad		X!,  { d4 - d7  }
+	pld	[ Y, #X_PRE ]
+	fldmiad		Y ,  { d8 - d11 }
+
+	FMAC_R1		d8 , d0, d4
+	FMAC_R2		d8 , d1, d5
+	FMAC_I1		d9 , d0, d5
+	FMAC_I2		d9 , d1, d4
+	fstmiad		Y!, { d8 }
+	fstmiad		Y!, { d9 }
+
+	FMAC_R1		d10, d0, d6
+	FMAC_R2		d10, d1, d7
+	FMAC_I1		d11, d0, d7
+	FMAC_I2		d11, d1, d6
+	fstmiad		Y!, { d10 }
+	fstmiad		Y!, { d11 }
+
+	pld	[ X, #X_PRE ]
+	fldmiad		X!,  { d4 - d7  }
+	pld	[ Y, #X_PRE ]
+	fldmiad		Y ,  { d8 - d11 }
+
+	FMAC_R1		d8 , d0, d4
+	FMAC_R2		d8 , d1, d5
+	FMAC_I1		d9 , d0, d5
+	FMAC_I2		d9 , d1, d4
+	fstmiad		Y!, { d8 }
+	fstmiad		Y!, { d9 }
+
+	FMAC_R1		d10, d0, d6
+	FMAC_R2		d10, d1, d7
+	FMAC_I1		d11, d0, d7
+	FMAC_I2		d11, d1, d6
+	fstmiad		Y!, { d10 }
+	fstmiad		Y!, { d11 }
+
+
+
+
+
+.endm
+
+
+// Complex double, unit stride: updates one complex element; X and Y advance past it.
+.macro KERNEL_F1
+
+	fldmiad		X!,  { d4 - d5  }
+	fldmiad		Y ,  { d8 - d9 }
+
+	FMAC_R1		d8 , d0, d4
+	FMAC_R2		d8 , d1, d5
+	FMAC_I1		d9 , d0, d5
+	FMAC_I2		d9 , d1, d4
+	fstmiad		Y!, { d8 }
+	fstmiad		Y!, { d9 }
+
+
+
+.endm
+
+// Complex double, strided: updates one complex element in place, then steps
+// X and Y by INC_X and INC_Y, which must already be byte offsets.
+.macro KERNEL_S1
+
+	fldmiad		X ,  { d4 - d5 }
+	fldmiad		Y ,  { d8 - d9 }
+
+	FMAC_R1		d8 , d0, d4
+	FMAC_R2		d8 , d1, d5
+	FMAC_I1		d9 , d0, d5
+	FMAC_I2		d9 , d1, d4
+	fstmiad		Y  , { d8 - d9 }
+
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+#else
+
+// Complex single, unit stride: updates four complex elements of y.
+// The work is done in two halves of two elements each. Per element the real
+// part (s8 / s10) receives FMAC_R1 and FMAC_R2, the imaginary part (s9 / s11)
+// receives FMAC_I1 and FMAC_I2, with the scalar held in s0 and s1.
+// Only the first half is preceded by prefetches.
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE ]
+	fldmias		X!,  { s4 - s7  }
+	pld	[ Y, #X_PRE ]
+	fldmias		Y ,  { s8 - s11 }
+
+	FMAC_R1		s8 , s0, s4
+	FMAC_R2		s8 , s1, s5
+	FMAC_I1		s9 , s0, s5
+	FMAC_I2		s9 , s1, s4
+	fstmias		Y!, { s8 }
+	fstmias		Y!, { s9 }
+
+	FMAC_R1		s10, s0, s6
+	FMAC_R2		s10, s1, s7
+	FMAC_I1		s11, s0, s7
+	FMAC_I2		s11, s1, s6
+	fstmias		Y!, { s10 }
+	fstmias		Y!, { s11 }
+
+	fldmias		X!,  { s4 - s7  }
+	fldmias		Y ,  { s8 - s11 }
+
+	FMAC_R1		s8 , s0, s4
+	FMAC_R2		s8 , s1, s5
+	FMAC_I1		s9 , s0, s5
+	FMAC_I2		s9 , s1, s4
+	fstmias		Y!, { s8 }
+	fstmias		Y!, { s9 }
+
+	FMAC_R1		s10, s0, s6
+	FMAC_R2		s10, s1, s7
+	FMAC_I1		s11, s0, s7
+	FMAC_I2		s11, s1, s6
+	fstmias		Y!, { s10 }
+	fstmias		Y!, { s11 }
+
+
+
+
+
+.endm
+
+
+// Complex single, unit stride: updates one complex element; X and Y advance past it.
+.macro KERNEL_F1
+
+	fldmias		X!,  { s4 - s5  }
+	fldmias		Y ,  { s8 - s9 }
+
+	FMAC_R1		s8 , s0, s4
+	FMAC_R2		s8 , s1, s5
+	FMAC_I1		s9 , s0, s5
+	FMAC_I2		s9 , s1, s4
+	fstmias		Y!, { s8 }
+	fstmias		Y!, { s9 }
+
+
+
+.endm
+
+// Complex single, strided: updates one complex element in place, then steps
+// X and Y by INC_X and INC_Y, which must already be byte offsets.
+.macro KERNEL_S1
+
+	fldmias		X ,  { s4 - s5 }
+	fldmias		Y ,  { s8 - s9 }
+
+	FMAC_R1		s8 , s0, s4
+	FMAC_R2		s8 , s1, s5
+	FMAC_I1		s9 , s0, s5
+	FMAC_I2		s9 , s1, s4
+	fstmias		Y  , { s8 - s9 }
+
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+#endif
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* Entry point: y := y + alpha * x
+*
+*	r0  (N)		number of elements
+*	r3  (X)		address of x
+*	[sp, #0]	INC_X on entry, loaded into r2
+*	[sp, #4]	Y on entry, loaded into r1
+*	[sp, #8]	INC_Y on entry, loaded into r4
+*	The scalar is read from d0 / s0, plus d1 / s1 in COMPLEX builds.
+*	NOTE(review): this register layout presumably matches a hard-float calling
+*	convention - confirm against the C prototype of the kernel.
+*
+*	Returns 0 in r0. Nothing is updated when N <= 0, INC_X == 0 or INC_Y == 0.
+*	r4, fp and the saved VFP registers are restored before returning.
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+	push    {r4 , fp}
+        add     fp, sp, #8					// fp = sp as it was on entry
+	sub     sp, sp, #STACKSIZE                              // reserve stack
+
+	ldr    INC_X , OLD_INC_X
+	ldr         Y, OLD_Y
+	ldr    INC_Y , OLD_INC_Y
+
+	sub     r12, fp, #128					// save area inside the reserved stack
+
+#if	defined(DOUBLE)
+        vstm    r12, { d8 - d15}                                 // store floating point registers
+#else
+        vstm    r12, { s8 - s15}                                 // store floating point registers
+#endif
+
+	cmp	N, #0
+	ble	axpy_kernel_L999				// nothing to do
+
+	cmp	INC_X, #0
+	beq	axpy_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	axpy_kernel_L999
+
+	cmp	INC_X, #1
+	bne	axpy_kernel_S_BEGIN				// strided path unless both strides are 1
+
+	cmp	INC_Y, #1
+	bne	axpy_kernel_S_BEGIN
+
+
+axpy_kernel_F_BEGIN:						// unit stride
+
+
+	asrs	I, N, #2					// I = N / 4
+	ble	axpy_kernel_F1
+
+	.align 5
+
+axpy_kernel_F4:							// two KERNEL_F4 blocks per loop trip
+
+#if !defined(COMPLEX) && !defined(DOUBLE)
+	pld	[ X, #X_PRE ]					// real single KERNEL_F4 has no prefetch of its own
+	pld	[ Y, #X_PRE ]
+#endif
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	ble	axpy_kernel_F1					// odd block count: leave after the first half
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	axpy_kernel_F4
+
+axpy_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	axpy_kernel_L999
+
+axpy_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+        bne     axpy_kernel_F10
+
+	b	axpy_kernel_L999
+
+axpy_kernel_S_BEGIN:						// strides become byte offsets for KERNEL_S1
+
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
+#endif
+
+#else
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+#endif
+
+#endif
+
+
+	asrs	I, N, #2					// I = N / 4
+	ble	axpy_kernel_S1
+
+	.align 5
+
+axpy_kernel_S4:							// four strided elements per loop trip
+
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	axpy_kernel_S4
+
+axpy_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	axpy_kernel_L999
+
+axpy_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+        bne     axpy_kernel_S10
+
+
+axpy_kernel_L999:
+
+	sub     r3, fp, #128					// same save area as in the prologue
+
+#if	defined(DOUBLE)
+        vldm    r3, { d8 - d15 }                                 // restore floating point registers
+#else
+        vldm    r3, { s8 - s15 }                                 // restore floating point registers
+#endif
+
+	mov	r0, #0		// set return value
+
+	sub     sp, fp, #8					// drop the reserved stack, back to the pushed pair
+	pop     {r4,fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/ccopy_vfp.S
+++ b/kernel/arm/ccopy_vfp.S
@ -0,0 +1,222 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+#define	OLD_Y	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]
+
+#define I	r5
+#define Y	r6
+#define INC_Y	r7
+
+#define X_PRE	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// COPY_F4: contiguous copy of four elements (8 floats, 32 bytes) from X to Y.
+// Both pointers are post-incremented; s0-s7 are used as the transfer buffer.
+.macro COPY_F4
+	pld	[ X, #X_PRE ]				// prefetch X_PRE bytes ahead of the read pointer
+	vldmia	X!, { s0 - s7 }
+	vstmia	Y!, { s0 - s7 }
+.endm
+
+// COPY_F1: contiguous copy of one element (2 floats, 8 bytes) from X to Y,
+// post-incrementing both pointers.
+.macro COPY_F1
+	vldmia	X!, { s0 - s1 }
+	vstmia	Y!, { s0 - s1 }
+.endm
+
+
+/*************************************************************************************************************************/
+
+// COPY_S4: strided copy of four elements (2 floats each).
+// X advances by INC_X and Y by INC_Y after every element; both increments
+// are byte strides at this point. The register pairs s0-s1 / s2-s3 alternate
+// between consecutive elements.
+.macro COPY_S4
+	nop
+
+	vldmia	X, { s0 - s1 }				// element 0
+	add	X, X, INC_X
+	vstmia	Y, { s0 - s1 }
+	add	Y, Y, INC_Y
+
+	vldmia	X, { s2 - s3 }				// element 1
+	add	X, X, INC_X
+	vstmia	Y, { s2 - s3 }
+	add	Y, Y, INC_Y
+
+	vldmia	X, { s0 - s1 }				// element 2
+	add	X, X, INC_X
+	vstmia	Y, { s0 - s1 }
+	add	Y, Y, INC_Y
+
+	vldmia	X, { s2 - s3 }				// element 3
+	add	X, X, INC_X
+	vstmia	Y, { s2 - s3 }
+	add	Y, Y, INC_Y
+.endm
+
+
+// COPY_S1: strided copy of one element (2 floats); X += INC_X, Y += INC_Y
+// (byte strides).
+.macro COPY_S1
+	vldmia	X, { s0 - s1 }
+	add	X, X, INC_X
+	vstmia	Y, { s0 - s1 }
+	add	Y, Y, INC_Y
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* ccopy kernel entry point.
+*
+* Inputs : N (r0) element count, X (r1) source, INC_X (r2) source increment,
+*          OLD_Y (r3) destination, OLD_INC_Y (first stack argument) destination
+*          increment. Each element is two single precision floats (8 bytes).
+* Output : r0 = 0.
+* Nothing is copied when N <= 0 or when either increment is 0.
+* Increments of 1/1 take the contiguous path (F labels); anything else takes
+* the strided path (S labels) after the increments are scaled to bytes.
+**************************************************************************************/
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp slot; first stack argument is at [fp, #4]
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15} 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr	INC_Y, OLD_INC_Y
+	
+	cmp	N, #0
+	ble	ccopy_kernel_L999
+
+	cmp	INC_X, #0
+	beq	ccopy_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	ccopy_kernel_L999
+
+	cmp	INC_X, #1
+	bne	ccopy_kernel_S_BEGIN
+
+	cmp	INC_Y, #1
+	bne	ccopy_kernel_S_BEGIN
+
+// Contiguous path: blocks of four elements, then the N % 4 remainder.
+ccopy_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	ccopy_kernel_F1
+
+ccopy_kernel_F4:
+
+	COPY_F4
+
+	subs	I, I, #1
+	bne	ccopy_kernel_F4
+
+ccopy_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4
+	ble	ccopy_kernel_L999
+
+ccopy_kernel_F10:
+
+	COPY_F1
+
+	subs    I, I, #1
+        bne     ccopy_kernel_F10
+
+	b	ccopy_kernel_L999
+
+// Strided path: same blocking, pointers advanced by the byte strides.
+ccopy_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
+
+	asrs	I, N, #2					// I = N / 4
+	ble	ccopy_kernel_S1
+
+ccopy_kernel_S4:
+
+	COPY_S4
+
+	subs	I, I, #1
+	bne	ccopy_kernel_S4
+
+ccopy_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4
+	ble	ccopy_kernel_L999
+
+ccopy_kernel_S10:
+
+	COPY_S1
+
+	subs    I, I, #1
+        bne     ccopy_kernel_S10
+
+
+
+
+
+
+// Common exit: restore s8-s15, unwind the frame, return 0.
+ccopy_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s15}					// restore floating point registers
+
+	mov	r0, #0						// set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/cdot_vfp.S
+++ b/kernel/arm/cdot_vfp.S
@ -0,0 +1,284 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/11 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+#define	OLD_Y	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]
+
+#define I	r5
+#define Y	r6
+#define INC_Y	r7
+
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// KERNEL_F4: contiguous path, four elements per expansion.
+// For every element (x0, x1) from X and (y0, y1) from Y it accumulates
+//   s0 += x0*y0   s1 += x0*y1   s2 += x1*y1   s3 += x1*y0
+// X and Y are post-incremented by 8 bytes per element. The second X load of
+// each pair is issued between the multiply-accumulates of the first element.
+// Scratch registers: s4-s7 (X data), s8-s11 (Y data).
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE  ]
+	pld	[ Y, #X_PRE  ]					// Y uses the same prefetch distance as X
+
+	fldmias	X!, { s4 - s5 }
+	fldmias	Y!, { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fldmias	X!, { s6 - s7 }
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+
+	fldmias	Y!, { s10 - s11 }
+	fmacs   s0  , s6,  s10
+	fmacs   s1  , s6,  s11
+	fmacs   s2  , s7,  s11
+	fmacs   s3  , s7,  s10
+
+
+	fldmias	X!, { s4 - s5 }
+	fldmias	Y!, { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fldmias	X!, { s6 - s7 }
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+
+	fldmias	Y!, { s10 - s11 }
+	fmacs   s0  , s6,  s10
+	fmacs   s1  , s6,  s11
+	fmacs   s2  , s7,  s11
+	fmacs   s3  , s7,  s10
+
+.endm
+
+// KERNEL_F1: contiguous path, one element.
+// Loads (s4, s5) from X and (s8, s9) from Y with post-increment and updates
+// the four partial sums s0 += s4*s8, s1 += s4*s9, s2 += s5*s9, s3 += s5*s8.
+.macro KERNEL_F1
+	vldmia	X!, { s4 - s5 }
+	vldmia	Y!, { s8 - s9 }
+	vmla.f32	s0, s4, s8
+	vmla.f32	s1, s4, s9
+	vmla.f32	s2, s5, s9
+	vmla.f32	s3, s5, s8
+.endm
+
+
+/*************************************************************************************************************************/
+
+// KERNEL_S4: strided path, four elements per expansion.
+// Emits a leading nop followed by four copies of the single-element step:
+// load (s4, s5) from X and (s8, s9) from Y, update the partial sums s0-s3,
+// then advance X by INC_X and Y by INC_Y (byte strides).
+.macro KERNEL_S4
+	nop
+
+	.rept	4
+	vldmia	X, { s4 - s5 }
+	vldmia	Y, { s8 - s9 }
+	vmla.f32	s0, s4, s8
+	vmla.f32	s1, s4, s9
+	vmla.f32	s2, s5, s9
+	vmla.f32	s3, s5, s8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	.endr
+.endm
+
+
+// KERNEL_S1: strided path, one element.
+// Same arithmetic as the contiguous single-element step, but the pointers are
+// advanced explicitly by the byte strides INC_X / INC_Y.
+.macro KERNEL_S1
+	vldmia	X, { s4 - s5 }
+	vldmia	Y, { s8 - s9 }
+	vmla.f32	s0, s4, s8
+	vmla.f32	s1, s4, s9
+	vmla.f32	s2, s5, s9
+	vmla.f32	s3, s5, s8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* cdot kernel entry point.
+*
+* Inputs : N (r0) element count, X (r1), INC_X (r2), OLD_Y (r3),
+*          OLD_INC_Y (first stack argument). Each element is two single
+*          precision floats (8 bytes).
+* Output : the result is left in s0 / s1. With (x0, x1) and (y0, y1) the two
+*          floats of an element:
+*            !CONJ : s0 = sum(x0*y0) - sum(x1*y1),  s1 = sum(x0*y1) + sum(x1*y0)
+*             CONJ : s0 = sum(x0*y0) + sum(x1*y1),  s1 = sum(x0*y1) - sum(x1*y0)
+*          The result is 0 when N <= 0 or when either increment is 0.
+*
+* The accumulators s0-s3 are cleared by moving an integer zero into them.
+* Subtracting a register from itself is not a safe way to clear it: s0-s3 hold
+* whatever the caller left there, and NaN - NaN or Inf - Inf yields NaN, which
+* would poison the whole dot product.
+**************************************************************************************/
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp slot; first stack argument is at [fp, #4]
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15} 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr	INC_Y, OLD_INC_Y
+
+	mov	r4, #0						// r4 is free again after the vstm above
+	vmov	s0, r4						// all-zero bit pattern == +0.0f
+	vmov.f32	s1, s0
+	vmov.f32	s2, s0
+	vmov.f32	s3, s0
+
+	cmp	N, #0
+	ble	cdot_kernel_L999
+
+	cmp	INC_X, #0
+	beq	cdot_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	cdot_kernel_L999
+
+	cmp	INC_X, #1
+	bne	cdot_kernel_S_BEGIN
+
+	cmp	INC_Y, #1
+	bne	cdot_kernel_S_BEGIN
+
+// Contiguous path: blocks of four elements, then the N % 4 remainder.
+cdot_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	cdot_kernel_F1
+
+cdot_kernel_F4:
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	cdot_kernel_F4
+
+cdot_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4
+	ble	cdot_kernel_L999
+
+cdot_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+        bne     cdot_kernel_F10
+
+	b	cdot_kernel_L999
+
+// Strided path: same blocking, pointers advanced by the byte strides.
+cdot_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
+
+	asrs	I, N, #2					// I = N / 4
+	ble	cdot_kernel_S1
+
+cdot_kernel_S4:
+
+	KERNEL_S4
+
+	subs	I, I, #1
+	bne	cdot_kernel_S4
+
+cdot_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4
+	ble	cdot_kernel_L999
+
+cdot_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+        bne     cdot_kernel_S10
+
+
+
+// Common exit: restore s8-s15 first (the kernels used s8-s11 as scratch),
+// then fold the four partial sums into s0 / s1.
+cdot_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s15}					// restore floating point registers
+
+#if !defined(CONJ)
+	vsub.f32	s0 , s0, s2
+	vadd.f32	s1 , s1, s3
+#else
+	vadd.f32	s0 , s0, s2
+	vsub.f32	s1 , s1, s3
+#endif
+
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/cgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfp.S
--- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
--- a/kernel/arm/cgemm_ncopy_2_vfp.S
+++ b/kernel/arm/cgemm_ncopy_2_vfp.S
@ -0,0 +1,258 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/05 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_M	r0
+#define	OLD_N	r1
+#define	OLD_A	r2
+#define	OLD_LDA	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define LDA	[fp, #-260 ]
+
+#define B	[fp, #4 ]
+
+#define M	r0
+#define N	r1
+#define A	r2
+
+#define	BO	r5
+
+#define	AO1	r6
+#define	AO2	r7
+
+#define I	r3
+#define	J	r12
+
+#define A_PRE	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// COPY2x2: pack two elements (2 floats each) from each of the two source
+// pointers AO1 and AO2 into BO. Output order is
+//   AO1[0], AO2[0], AO1[1], AO2[1]
+// Both source pointers advance by 16 bytes and BO by 32 bytes.
+.macro COPY2x2
+	vldr	s0, [ AO1 ]				// AO1 element 0
+	vldr	s1, [ AO1, #4 ]
+	vldr	s2, [ AO2 ]				// AO2 element 0
+	vldr	s3, [ AO2, #4 ]
+	vldr	s4, [ AO1, #8 ]				// AO1 element 1
+	vldr	s5, [ AO1, #12 ]
+	vldr	s6, [ AO2, #8 ]				// AO2 element 1
+	vldr	s7, [ AO2, #12 ]
+
+	add	AO1, AO1, #16
+	add	AO2, AO2, #16
+	vstmia	BO!, { s0 - s7 }
+.endm
+
+
+// COPY1x2: pack one element (2 floats) from AO1 followed by one element from
+// AO2 into BO. AO1 and AO2 advance by 8 bytes, BO by 16 bytes.
+.macro COPY1x2
+	vldmia	AO1!, { s0 - s1 }
+	vldmia	AO2!, { s2 - s3 }
+	vstmia	BO!, { s0 - s3 }
+.endm
+
+// COPY2x1: copy two consecutive elements (4 floats, 16 bytes) from AO1 to BO,
+// advancing both pointers by 16 bytes.
+.macro COPY2x1
+	vldmia	AO1!, { s0 - s3 }
+	vstmia	BO!, { s0 - s3 }
+.endm
+
+
+// COPY1x1: copy one element (2 floats, 8 bytes) from AO1 to BO, advancing
+// both pointers by 8 bytes.
+.macro COPY1x1
+	vldmia	AO1!, { s0 - s1 }
+	vstmia	BO!, { s0 - s1 }
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* cgemm ncopy (unroll 2) entry point.
+*
+* Inputs : M (r0), N (r1), A (r2), lda (r3, in elements), B (first stack
+*          argument). Each element is two single precision floats (8 bytes).
+* Output : r0 = 0; packed data is written sequentially through BO.
+*
+* Outer loop: N / 2 iterations, each handling the pair of source pointers
+* AO1 = A and AO2 = A + LDA; a last pass handles the single remaining pointer
+* when N is odd. Inner loops walk M elements along each pointer.
+*
+* Flag handling: a shift with the S suffix updates N, Z and C but leaves V
+* untouched, and "ble" tests Z == 1 || N != V. The first conditional branch
+* must therefore not rely on the V flag inherited from the caller, so the
+* outer counter is tested with an explicit cmp (which always clears V when
+* comparing against 0). Tests of a single masked bit use beq.
+**************************************************************************************/
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp slot; first stack argument is at [fp, #4]
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+
+	lsl	r3, r3, #3					// lda = lda * 4 * 2
+	str	r3, LDA						// r3 is reused as the inner counter I
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15} 				// store floating point registers
+
+	ldr	BO, B
+
+/*********************************************************************************************/
+
+cgemm_ncopy_L2_BEGIN:
+
+	asr	J, N, #1					// J = N / 2
+	cmp	J, #0						// defines all flags, V = 0
+	ble	cgemm_ncopy_L1_BEGIN
+
+cgemm_ncopy_L2_M2_BEGIN:
+
+	mov	AO1, A						// AO1 = A
+	ldr	r4 , LDA
+	add	AO2, AO1, r4
+	add	A  , AO2, r4 					// A = A + 2 * LDA
+
+	asrs	I, M, #1					// I = M / 2 (V is 0 here)
+	ble	cgemm_ncopy_L2_M2_40
+
+// Inner loop unrolled twice so that one pair of prefetches covers two COPY2x2.
+cgemm_ncopy_L2_M2_20:
+
+	pld	[ AO1, #A_PRE ]
+	pld	[ AO2, #A_PRE ]
+
+	COPY2x2
+	subs	I , I , #1
+	ble	cgemm_ncopy_L2_M2_40
+
+	COPY2x2
+	subs	I , I , #1
+	bne	cgemm_ncopy_L2_M2_20
+
+
+cgemm_ncopy_L2_M2_40:
+
+	ands	I, M , #1					// I = M % 2
+	beq	cgemm_ncopy_L2_M2_END
+
+cgemm_ncopy_L2_M2_60:
+
+	COPY1x2
+
+	subs	I , I , #1
+	bne	cgemm_ncopy_L2_M2_60
+
+
+cgemm_ncopy_L2_M2_END:
+
+	subs    J , J, #1                                               // j--
+        bne     cgemm_ncopy_L2_M2_BEGIN
+
+
+/*********************************************************************************************/
+
+cgemm_ncopy_L1_BEGIN:
+
+	tst	N, #1						// odd N: one pointer left
+	beq	cgemm_ncopy_L999
+
+
+cgemm_ncopy_L1_M2_BEGIN:
+
+	mov	AO1, A						// AO1 = A
+	ldr	r4 , LDA
+	add	A  , AO1, r4 					// A = A + 1 * LDA
+
+	asrs	I, M, #1					// I = M / 2 (V is 0 here)
+	ble	cgemm_ncopy_L1_M2_40
+
+cgemm_ncopy_L1_M2_20:
+
+	COPY2x1
+
+	subs	I , I , #1
+	bne	cgemm_ncopy_L1_M2_20
+
+
+cgemm_ncopy_L1_M2_40:
+
+	ands	I, M , #1					// I = M % 2
+	beq	cgemm_ncopy_L1_M2_END
+
+cgemm_ncopy_L1_M2_60:
+
+	COPY1x1
+
+	subs	I , I , #1
+	bne	cgemm_ncopy_L1_M2_60
+
+
+cgemm_ncopy_L1_M2_END:
+
+
+
+cgemm_ncopy_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s15}					// restore floating point registers
+
+	movs	r0, #0						// set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/cgemm_tcopy_2_vfp.S
+++ b/kernel/arm/cgemm_tcopy_2_vfp.S
@ -0,0 +1,243 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_M	r0
+#define	OLD_N	r1
+#define	OLD_A	r2
+#define	OLD_LDA	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define B	[fp, #4 ]
+#define A	[fp, #-248 ]
+
+#define M	r0
+#define N	r1
+#define M4	r2
+
+#define	LDA	r5
+
+#define	AO1	r6
+#define	BO1	r7
+#define	BO2	r8
+
+#define I	r4
+#define	J	r12
+
+#define A_PRE	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+// COPY2x2: read two elements (4 floats) at AO1 and two elements at AO1 + LDA,
+// store the eight floats at BO1, then advance AO1 by 16 bytes and BO1 by M4.
+// r3 is clobbered (left pointing at the original AO1 + LDA).
+.macro COPY2x2
+	add	r3, AO1, LDA				// second source pointer, taken before AO1 moves
+	vldmia	AO1!, { s0 - s3 }
+	vldmia	r3, { s4 - s7 }
+
+	vstmia	BO1, { s0 - s7 }
+	add	BO1, BO1, M4
+.endm
+
+// COPY1x2: read one element (2 floats) at AO1 and one element at AO1 + LDA,
+// store the four floats at BO2, then advance AO1 by 8 bytes and BO2 by
+// 16 bytes. r3 is clobbered (left pointing at the original AO1 + LDA).
+.macro COPY1x2
+	add	r3, AO1, LDA				// second source pointer, taken before AO1 moves
+	vldmia	AO1!, { s0 - s1 }
+	vldmia	r3, { s2 - s3 }
+
+	vstmia	BO2!, { s0 - s3 }
+.endm
+
+/*************************************************************************************************************************/
+// COPY2x1: copy two elements (4 floats) from AO1 to BO1, then advance AO1 by
+// 16 bytes and BO1 by M4.
+.macro COPY2x1
+	vldmia	AO1!, { s0 - s3 }
+	vstmia	BO1, { s0 - s3 }
+	add	BO1, BO1, M4
+.endm
+
+// COPY1x1: copy one element (2 floats) from AO1 to BO2, advancing both
+// pointers by 8 bytes.
+.macro COPY1x1
+	vldmia	AO1!, { s0 - s1 }
+	vstmia	BO2!, { s0 - s1 }
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* cgemm tcopy (unroll 2) entry point.
+*
+* Inputs : M (r0), N (r1), A (r2), lda (r3, in elements), B (first stack
+*          argument). Each element is two single precision floats (8 bytes).
+* Output : r0 = 0.
+*
+* Outer loop: M / 2 iterations, each reading from the two source pointers
+* A and A + LDA; a last pass handles the single remaining pointer when M is
+* odd. Inner loops walk N elements in pairs, writing through BO1 with a
+* stride of M4 bytes; the odd element of N (if any) is written through BO2,
+* which starts at B + (N & ~1) * M * 8 bytes.
+*
+* The current A and B positions live in memory (A in the local frame, B in
+* the incoming stack argument slot) because r2 / r3 are reused as M4 and
+* scratch.
+*
+* Flag handling: a shift with the S suffix updates N, Z and C but leaves V
+* untouched, and "ble" tests Z == 1 || N != V. The first conditional branch
+* must therefore not rely on the V flag inherited from the caller, so the
+* outer counter is tested with an explicit cmp (which always clears V when
+* comparing against 0). Tests of a single masked bit use beq.
+**************************************************************************************/
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp slot; first stack argument is at [fp, #4]
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	str	OLD_A, A					// store A (r2 becomes M4 below)
+
+	lsl	LDA, OLD_LDA, #3				// lda = lda * SIZE * 2
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15} 				// store floating point registers
+
+	lsl	r4 , M, #3					// M * SIZE * 2
+
+	ldr	r3, B
+
+	and	BO2 , N , #-2					// N rounded down to even
+
+	mul	BO2, BO2, r4
+
+	add	BO2 , BO2, r3					// BO2 = B + (N & ~1) * M * SIZE * 2
+
+	lsl	M4, M, #4					// M4 = M * 2 * SIZE * 2
+
+cgemm_tcopy_L2_BEGIN:
+
+	asr	J, M, #1					// J = M / 2
+	cmp	J, #0						// defines all flags, V = 0
+	ble	cgemm_tcopy_L1_BEGIN
+
+cgemm_tcopy_L2_M2_BEGIN:
+
+	ldr	AO1, A						// AO1 = A
+	lsl	r3, LDA, #1					// r3 = 2 * LDA
+	add	r3, r3 , AO1					// A = A + 2 * LDA
+	str	r3, A						// store A
+
+	ldr	BO1, B
+	add	r3, BO1, #32					// B = B + 4 * SIZE *2
+	str	r3, B
+
+	asrs	I, N, #1					// I = N / 2 (V is 0 here)
+	ble	cgemm_tcopy_L2_M2_60
+
+cgemm_tcopy_L2_M2_40:
+
+	COPY2x2
+	subs I, I, #1
+	bne	cgemm_tcopy_L2_M2_40
+
+cgemm_tcopy_L2_M2_60:
+
+	tst	N , #1						// odd N: one element left per source pointer
+	beq	cgemm_tcopy_L2_M2_END
+
+	COPY1x2
+
+
+cgemm_tcopy_L2_M2_END:
+
+	subs	J , J, #1						// j--
+	bne	cgemm_tcopy_L2_M2_BEGIN
+
+/*********************************************************************************************/
+
+cgemm_tcopy_L1_BEGIN:
+
+	tst	M, #1						// odd M: one source pointer left
+	beq	cgemm_tcopy_L999
+
+
+cgemm_tcopy_L1_M2_BEGIN:
+
+	ldr	AO1, A						// AO1 = A
+	add	r3, LDA , AO1					// A = A + 1 * LDA
+	str	r3, A						// store A
+
+	ldr	BO1, B
+	add	r3, BO1, #16					// B = B + 2 * SIZE *2
+	str	r3, B
+
+	asrs	I, N, #1					// I = N / 2 (V is 0 here)
+	ble	cgemm_tcopy_L1_M2_60
+
+
+cgemm_tcopy_L1_M2_40:
+
+	COPY2x1
+	subs I, I, #1
+	bne	cgemm_tcopy_L1_M2_40
+
+cgemm_tcopy_L1_M2_60:
+
+	tst	N , #1						// odd N: one element left
+	beq	cgemm_tcopy_L1_M2_END
+
+	COPY1x1
+
+
+cgemm_tcopy_L1_M2_END:
+
+
+
+cgemm_tcopy_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s15}					// restore floating point registers
+
+	mov	r0, #0						// set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/cgemv_n_vfp.S
+++ b/kernel/arm/cgemv_n_vfp.S
@ -0,0 +1,697 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/29 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_LDA		[fp, #0 ]
+#define	X		[fp, #4 ]
+#define	OLD_INC_X	[fp, #8 ]
+#define	Y		[fp, #12 ]
+#define	OLD_INC_Y	[fp, #16 ]
+#define OLD_A		r3
+#define	OLD_M		r0
+
+#define AO1	r0
+#define N	r1
+#define J	r2
+
+#define AO2	r4
+#define XO	r5
+#define YO	r6
+#define LDA	r7
+#define INC_X	r8
+#define INC_Y	r9
+
+#define I	r12
+
+#define ALPHA_I [fp, #-236]
+#define ALPHA_R [fp, #-244]
+
+#define M	[fp, #-252 ]
+#define A	[fp, #-256 ]
+
+
+#define X_PRE	64
+#define Y_PRE	0
+#define A_PRE	0
+
+/**************************************************************************************/
+
+/**************************************************************************************
+* Multiply-accumulate selection by conjugation mode.
+*
+* fmacs adds the product to the destination, fnmacs subtracts it.
+* KMAC_R / KMAC_I are used by the KERNEL_* macros for the terms built from
+* the second float of the A element (s1 / s3).
+* FMAC_R1 / FMAC_R2 / FMAC_I1 / FMAC_I2 are used by the SAVE_* macros when
+* the accumulators are scaled by ALPHA_R / ALPHA_I and added to Y.
+* The four branches cover the combinations of CONJ and XCONJ.
+**************************************************************************************/
+#if !defined(CONJ) && !defined(XCONJ)
+
+        #define KMAC_R  fnmacs
+        #define KMAC_I  fmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fnmacs
+        #define FMAC_I1 fmacs
+        #define FMAC_I2 fmacs
+
+#elif defined(CONJ) && !defined(XCONJ)
+
+        #define KMAC_R  fmacs
+        #define KMAC_I  fnmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fnmacs
+        #define FMAC_I1 fmacs
+        #define FMAC_I2 fmacs
+
+#elif !defined(CONJ) && defined(XCONJ)
+
+        #define KMAC_R  fmacs
+        #define KMAC_I  fnmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fmacs
+        #define FMAC_I1 fnmacs
+        #define FMAC_I2 fmacs
+
+#else
+
+        #define KMAC_R  fnmacs
+        #define KMAC_I  fmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fmacs
+        #define FMAC_I1 fnmacs
+        #define FMAC_I2 fmacs
+
+#endif
+
+// INIT_F4: prefetch the Y block and clear the eight accumulators s8-s15.
+// NOTE(review): s8 is cleared by subtracting it from itself. If s8 still
+// holds NaN or Inf (for example from the previous block of accumulations)
+// the result is NaN, not 0, and it is then copied into s9-s15. Clearing from
+// an integer zero or a stored 0.0 would avoid this - confirm and fix together
+// with the main routine, which owns the scratch registers.
+.macro INIT_F4
+
+	pld	[ YO, #Y_PRE ]
+        vsub.f32                s8 , s8 , s8
+        vmov.f32                s9 , s8
+        vmov.f32                s10, s8
+        vmov.f32                s11, s8
+        vmov.f32                s12, s8
+        vmov.f32                s13, s8
+        vmov.f32                s14, s8
+        vmov.f32                s15, s8
+
+.endm
+
+// KERNEL_F4X4: one X prefetch followed by four expansions of KERNEL_F4X1.
+.macro KERNEL_F4X4
+	pld	[ XO, #X_PRE ]
+	.rept	4
+	KERNEL_F4X1
+	.endr
+.endm
+
+// KERNEL_F4X1: contiguous-X step for a block of four Y elements.
+// Reads four consecutive elements (8 floats, offsets 0..28) at AO1 and one
+// element (s4, s5) at XO, and accumulates the products into s8-s15
+// (one register pair per A element). The s1 / s3 terms go through
+// KMAC_R / KMAC_I so that the sign follows the conjugation mode.
+// Afterwards XO advances by 8 bytes and AO1 / AO2 by LDA.
+// AO2 is only used as the prefetch address here.
+.macro KERNEL_F4X1
+
+	pld	[ AO2, #A_PRE ]
+        flds    s0 , [ AO1 ]
+        flds    s1 , [ AO1, #4  ]
+        flds    s2 , [ AO1, #8 ]
+        flds    s3 , [ AO1, #12 ]
+
+        flds    s4 , [ XO ]
+        flds    s5 , [ XO, #4 ]
+
+        fmacs   s8  , s0,  s4
+        fmacs   s9  , s0,  s5
+        fmacs   s10 , s2,  s4
+        fmacs   s11 , s2,  s5
+
+        KMAC_R  s8  , s1,  s5
+        KMAC_I  s9  , s1,  s4
+        KMAC_R  s10 , s3,  s5
+        KMAC_I  s11 , s3,  s4
+
+        flds    s0 , [ AO1, #16 ]
+        flds    s1 , [ AO1, #20 ]
+        flds    s2 , [ AO1, #24 ]
+        flds    s3 , [ AO1, #28 ]
+
+        fmacs   s12 , s0,  s4
+        fmacs   s13 , s0,  s5
+        fmacs   s14 , s2,  s4
+        fmacs   s15 , s2,  s5
+
+        KMAC_R  s12 , s1,  s5
+        KMAC_I  s13 , s1,  s4
+        KMAC_R  s14 , s3,  s5
+        KMAC_I  s15 , s3,  s4
+
+        add     XO , XO, #8
+        add     AO1 , AO1, LDA
+        add     AO2 , AO2, LDA
+
+.endm
+
+// SAVE_F4: contiguous-Y update for four elements.
+// Loads ALPHA_R / ALPHA_I into s0 / s1, then, two elements at a time, reads
+// Y into s4-s7, adds the accumulators (s8-s11, then s12-s15) scaled by alpha
+// using the FMAC_* variants selected for the conjugation mode, and stores the
+// result back. YO advances by 16 bytes per store (32 bytes in total).
+.macro SAVE_F4
+
+        flds            s0, ALPHA_R
+        flds            s1, ALPHA_I
+
+        fldmias YO, { s4 - s7 }
+
+        FMAC_R1 s4 , s0 , s8
+        FMAC_I1 s5 , s0 , s9
+        FMAC_R2 s4 , s1 , s9
+        FMAC_I2 s5 , s1 , s8
+
+        FMAC_R1 s6 , s0 , s10
+        FMAC_I1 s7 , s0 , s11
+        FMAC_R2 s6 , s1 , s11
+        FMAC_I2 s7 , s1 , s10
+
+        fstmias YO!, { s4 - s7 }
+
+        fldmias YO, { s4 - s7 }
+
+        FMAC_R1 s4 , s0 , s12
+        FMAC_I1 s5 , s0 , s13
+        FMAC_R2 s4 , s1 , s13
+        FMAC_I2 s5 , s1 , s12
+
+        FMAC_R1 s6 , s0 , s14
+        FMAC_I1 s7 , s0 , s15
+        FMAC_R2 s6 , s1 , s15
+        FMAC_I2 s7 , s1 , s14
+
+        fstmias YO!, { s4 - s7 }
+
+.endm
+
+
+
+
+// INIT_F1: clear the accumulator pair s8 / s9.
+// NOTE(review): subtracting s8 from itself yields NaN, not 0, when s8 holds
+// NaN or Inf; consider clearing from an integer zero or a stored 0.0.
+.macro INIT_F1
+
+        vsub.f32                s8 , s8 , s8
+        vmov.f32                s9 , s8
+
+.endm
+
+// KERNEL_F1X1: contiguous-X step for a single Y element.
+// Reads one element (s0, s1) at AO1 and one element (s4, s5) at XO and
+// accumulates into s8 / s9; the s1 terms use KMAC_R / KMAC_I so that the
+// sign follows the conjugation mode. XO advances by 8 bytes, AO1 by LDA.
+.macro KERNEL_F1X1
+	vldmia	AO1, { s0 - s1 }
+	vldmia	XO!, { s4 - s5 }
+
+	vmla.f32	s8, s0, s4
+	vmla.f32	s9, s0, s5
+
+	KMAC_R	s8, s1, s5
+	KMAC_I	s9, s1, s4
+
+	add	AO1, AO1, LDA
+.endm
+
+// SAVE_F1: contiguous-Y update for one element.
+// Reads Y into s4 / s5, adds the accumulator pair s8 / s9 scaled by
+// ALPHA_R / ALPHA_I (FMAC_* variants chosen by the conjugation mode), stores
+// the element back and advances YO by 8 bytes.
+.macro SAVE_F1
+	vldr	s0, ALPHA_R
+	vldr	s1, ALPHA_I
+
+	vldmia	YO, { s4 - s5 }
+
+	FMAC_R1	s4, s0, s8				// first float: ALPHA_R term, then ALPHA_I term
+	FMAC_R2	s4, s1, s9
+	FMAC_I1	s5, s0, s9				// second float: ALPHA_R term, then ALPHA_I term
+	FMAC_I2	s5, s1, s8
+
+	vstmia	YO!, { s4 - s5 }
+.endm
+
+/****************************************************************************************/
+
+// INIT_S4: clear the eight accumulators s8-s15 (strided path).
+// NOTE(review): subtracting s8 from itself yields NaN, not 0, when s8 holds
+// NaN or Inf, and that value is then copied into s9-s15; consider clearing
+// from an integer zero or a stored 0.0.
+.macro INIT_S4
+
+        vsub.f32                s8 , s8 , s8
+        vmov.f32                s9 , s8
+        vmov.f32                s10, s8
+        vmov.f32                s11, s8
+        vmov.f32                s12, s8
+        vmov.f32                s13, s8
+        vmov.f32                s14, s8
+        vmov.f32                s15, s8
+
+.endm
+
+// KERNEL_S4X4: four expansions of KERNEL_S4X1.
+.macro KERNEL_S4X4
+	.rept	4
+	KERNEL_S4X1
+	.endr
+.endm
+
+// KERNEL_S4X1: strided-X step for a block of four Y elements.
+// Reads four consecutive elements (8 floats, offsets 0..28) at AO1 and one
+// element (s4, s5) at XO, and accumulates the products into s8-s15. The
+// s1 / s3 terms go through KMAC_R / KMAC_I so that the sign follows the
+// conjugation mode. Afterwards XO advances by INC_X and AO1 / AO2 by LDA.
+// AO2 is advanced but not read in this macro.
+.macro KERNEL_S4X1
+
+        flds    s0 , [ AO1 ]
+        flds    s1 , [ AO1, #4  ]
+        flds    s2 , [ AO1, #8 ]
+        flds    s3 , [ AO1, #12 ]
+
+        flds    s4 , [ XO ]
+        flds    s5 , [ XO, #4 ]
+
+        fmacs   s8  , s0,  s4
+        fmacs   s9  , s0,  s5
+        fmacs   s10 , s2,  s4
+        fmacs   s11 , s2,  s5
+
+        KMAC_R  s8  , s1,  s5
+        KMAC_I  s9  , s1,  s4
+        KMAC_R  s10 , s3,  s5
+        KMAC_I  s11 , s3,  s4
+
+        flds    s0 , [ AO1, #16 ]
+        flds    s1 , [ AO1, #20 ]
+        flds    s2 , [ AO1, #24 ]
+        flds    s3 , [ AO1, #28 ]
+
+        fmacs   s12 , s0,  s4
+        fmacs   s13 , s0,  s5
+        fmacs   s14 , s2,  s4
+        fmacs   s15 , s2,  s5
+
+        KMAC_R  s12 , s1,  s5
+        KMAC_I  s13 , s1,  s4
+        KMAC_R  s14 , s3,  s5
+        KMAC_I  s15 , s3,  s4
+
+        add     XO , XO, INC_X
+        add     AO1 , AO1, LDA
+        add     AO2 , AO2, LDA
+
+.endm
+
+// SAVE_S4: strided-Y update for four elements.
+// Loads ALPHA_R / ALPHA_I into s0 / s1, then for each of the four
+// accumulator pairs (s8/s9, s10/s11, s12/s13, s14/s15) reads one Y element,
+// adds the pair scaled by alpha using the FMAC_* variants selected for the
+// conjugation mode, stores the element back and advances YO by INC_Y.
+.macro SAVE_S4
+
+        flds            s0, ALPHA_R
+        flds            s1, ALPHA_I
+
+        fldmias YO, { s4 - s5 }
+
+        FMAC_R1 s4 , s0 , s8
+        FMAC_I1 s5 , s0 , s9
+        FMAC_R2 s4 , s1 , s9
+        FMAC_I2 s5 , s1 , s8
+
+        fstmias YO, { s4 - s5 }
+
+	add	YO, YO, INC_Y
+
+        fldmias YO, { s6 - s7 }
+
+        FMAC_R1 s6 , s0 , s10
+        FMAC_I1 s7 , s0 , s11
+        FMAC_R2 s6 , s1 , s11
+        FMAC_I2 s7 , s1 , s10
+
+        fstmias YO, { s6 - s7 }
+
+	add	YO, YO, INC_Y
+
+        fldmias YO, { s4 - s5 }
+
+        FMAC_R1 s4 , s0 , s12
+        FMAC_I1 s5 , s0 , s13
+        FMAC_R2 s4 , s1 , s13
+        FMAC_I2 s5 , s1 , s12
+
+        fstmias YO, { s4 - s5 }
+
+	add	YO, YO, INC_Y
+
+        fldmias YO, { s6 - s7 }
+
+        FMAC_R1 s6 , s0 , s14
+        FMAC_I1 s7 , s0 , s15
+        FMAC_R2 s6 , s1 , s15
+        FMAC_I2 s7 , s1 , s14
+
+        fstmias YO, { s6 - s7 }
+
+	add	YO, YO, INC_Y
+
+.endm
+
+
+
+
+/*
+ * Clear the accumulator pair s8 / s9 before a single-row pass.
+ * The zero bit pattern is moved in from a core register. Subtracting a
+ * register from itself is not a safe way to clear it: if it holds NaN or
+ * Inf (stale callee-saved contents, or the result of the previous row)
+ * the difference is NaN and the whole row would be poisoned.
+ * Clobbers J; the caller loads J with the column count right after this macro.
+ */
+.macro INIT_S1
+
+	mov	J, #0
+	vmov	s8 , J
+	vmov	s9 , J
+
+.endm
+
+/*
+ * One column step of the strided single-row kernel.
+ * Reads one complex value from AO1 (s0, s1) and one complex x element from
+ * XO (s4, s5), accumulates their product into s8 / s9, then advances XO by
+ * INC_X and AO1 by LDA.
+ * KMAC_R / KMAC_I are defined earlier in the file, outside this excerpt.
+ */
+.macro KERNEL_S1X1
+
+        flds    s0 , [ AO1 ]
+        flds    s1 , [ AO1, #4 ]
+
+        flds    s4 , [ XO ]
+        flds    s5 , [ XO, #4 ]
+
+        fmacs   s8  , s0,  s4
+        fmacs   s9  , s0,  s5
+
+        KMAC_R  s8  , s1,  s5
+        KMAC_I  s9  , s1,  s4
+
+        add     XO , XO, INC_X
+        add     AO1 , AO1, LDA
+
+
+.endm
+
+/*
+ * Fold the single accumulated row result (s8 / s9) into the y element at YO
+ * for the strided path, then advance YO by INC_Y.
+ * alpha is reloaded from the ALPHA_R / ALPHA_I stack slots because the
+ * kernel macro overwrites s0 / s1.
+ */
+.macro SAVE_S1
+
+        flds            s0, ALPHA_R
+        flds            s1, ALPHA_I
+
+        fldmias YO, { s4 - s5 }
+
+        FMAC_R1 s4 , s0 , s8
+        FMAC_I1 s5 , s0 , s9
+        FMAC_R2 s4 , s1 , s9
+        FMAC_I2 s5 , s1 , s8
+
+        fstmias YO, { s4 - s5 }
+
+        add     YO, YO, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/*
+ * Kernel entry point (labels cgemvn_kernel_*).
+ * The register and stack-slot aliases used below (OLD_M, N, A, M, I, J, ...)
+ * are defined earlier in the file, outside this excerpt.
+ *
+ * Flow: save registers, bail out on empty or zero-stride input, spill A, M
+ * and alpha to the stack, scale LDA to bytes, then process the rows in
+ * blocks of four followed by the M % 4 leftover rows. Two code paths exist:
+ * F* when both increments are 1, S* otherwise. Always returns 0 in r0.
+ */
+	PROLOGUE
+
+	.align 5
+	// 7 words are pushed, so fp = sp + 28 is the value sp had on entry
+	push    {r4 - r9 , fp}
+        add     fp, sp, #28
+	sub     sp, sp, #STACKSIZE                              // reserve stack
+
+        sub     r12, fp, #192
+
+#if	defined(DOUBLE)
+        vstm    r12, { d8 - d15 }                                 // store floating point registers
+#else
+        vstm    r12, { s8 - s15 }                                 // store floating point registers
+#endif
+
+	// nothing to do for an empty matrix
+	cmp	OLD_M, #0
+	ble	cgemvn_kernel_L999
+
+	cmp	N, #0
+	ble	cgemvn_kernel_L999
+
+	// spill the values whose registers are reused, including alpha (s0 / s1)
+	str	OLD_A, A
+	str	OLD_M, M
+	vstr    s0 , ALPHA_R
+        vstr    s1 , ALPHA_I
+
+
+	ldr    INC_X , OLD_INC_X
+	ldr    INC_Y , OLD_INC_Y
+
+	// a zero increment is treated as nothing to do
+	cmp	INC_X, #0
+	beq	cgemvn_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	cgemvn_kernel_L999
+
+	ldr	LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #4				// LDA * SIZE * 2
+#else
+	lsl	LDA, LDA, #3				// LDA * SIZE * 2
+#endif
+
+	// anything other than unit stride on both vectors takes the S path
+	cmp	INC_X, #1
+	bne	cgemvn_kernel_S4_BEGIN
+
+	cmp	INC_Y, #1
+	bne	cgemvn_kernel_S4_BEGIN
+
+
+/* ---- unit-stride path: blocks of four rows ---- */
+cgemvn_kernel_F4_BEGIN:
+
+	ldr	YO , Y
+
+	ldr	I, M
+	asrs	I, I, #2					// I = M / 4
+	ble	cgemvn_kernel_F1_BEGIN
+
+cgemvn_kernel_F4X4:
+
+	// AO1 = current row block; the saved A moves on by 32 bytes (4 complex floats)
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO1, #32
+	str	r3 , A
+
+	// AO2 ends up three columns ahead of AO1
+	add	AO2, AO2, LDA
+	add	AO2, AO2, LDA
+
+	ldr	XO , X
+
+	INIT_F4
+
+	asrs	J, N, #2					// J = N / 4
+	ble	cgemvn_kernel_F4X1
+
+
+cgemvn_kernel_F4X4_10:
+
+	KERNEL_F4X4
+
+	subs	J, J, #1
+	bne	cgemvn_kernel_F4X4_10
+
+
+cgemvn_kernel_F4X1:
+
+	// leftover columns (N % 4), one at a time
+	ands	J, N , #3
+	ble	cgemvn_kernel_F4_END
+
+cgemvn_kernel_F4X1_10:
+
+	KERNEL_F4X1
+
+	subs	J, J, #1
+	bne	cgemvn_kernel_F4X1_10
+
+
+cgemvn_kernel_F4_END:
+
+	SAVE_F4
+
+	subs	I , I , #1
+	bne	cgemvn_kernel_F4X4
+
+
+/* ---- unit-stride path: leftover rows (M % 4), one row per pass ---- */
+cgemvn_kernel_F1_BEGIN:
+
+	ldr	I, M
+	ands	I,  I , #3
+	ble	cgemvn_kernel_L999
+
+cgemvn_kernel_F1X1:
+
+	// the saved A moves on by 8 bytes (one complex float)
+	ldr	AO1, A
+	add	r3, AO1, #8
+	str	r3, A
+	
+	ldr	XO , X
+
+	INIT_F1
+
+	mov	J, N
+
+
+cgemvn_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	J, J, #1
+	bne	cgemvn_kernel_F1X1_10
+
+
+cgemvn_kernel_F1_END:
+
+	SAVE_F1
+
+	subs	I , I , #1
+	bne	cgemvn_kernel_F1X1
+
+	b	cgemvn_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+/* ---- strided path: increments are converted to byte offsets first ---- */
+cgemvn_kernel_S4_BEGIN:
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
+#endif
+
+	ldr	YO , Y
+
+	ldr	I, M
+	asrs	I, I, #2					// I = M / 4
+	ble	cgemvn_kernel_S1_BEGIN
+
+cgemvn_kernel_S4X4:
+
+	// AO1 = current row block; the saved A moves on by 32 bytes (4 complex floats)
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO1, #32
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S4
+
+	asrs	J, N, #2					// J = N / 4
+	ble	cgemvn_kernel_S4X1
+
+
+cgemvn_kernel_S4X4_10:
+
+	KERNEL_S4X4
+
+	subs	J, J, #1
+	bne	cgemvn_kernel_S4X4_10
+
+
+cgemvn_kernel_S4X1:
+
+	// leftover columns (N % 4), one at a time
+	ands	J, N , #3
+	ble	cgemvn_kernel_S4_END
+
+cgemvn_kernel_S4X1_10:
+
+	KERNEL_S4X1
+
+	subs	J, J, #1
+	bne	cgemvn_kernel_S4X1_10
+
+
+cgemvn_kernel_S4_END:
+
+	SAVE_S4
+
+	subs	I , I , #1
+	bne	cgemvn_kernel_S4X4
+
+
+/* ---- strided path: leftover rows (M % 4), one row per pass ---- */
+cgemvn_kernel_S1_BEGIN:
+
+	ldr	I, M
+	ands	I,  I , #3
+	ble	cgemvn_kernel_L999
+
+cgemvn_kernel_S1X1:
+
+	// the saved A moves on by 8 bytes (one complex float)
+	ldr	AO1, A
+	add	r3, AO1, #8
+	str	r3, A
+	
+	ldr	XO , X
+
+	INIT_S1
+
+	mov	J, N
+
+
+cgemvn_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	J, J, #1
+	bne	cgemvn_kernel_S1X1_10
+
+
+cgemvn_kernel_S1_END:
+
+	SAVE_S1
+
+	subs	I , I , #1
+	bne	cgemvn_kernel_S1X1
+
+
+/*************************************************************************************************************/
+
+/* ---- common exit: restore the saved registers and return 0 ---- */
+cgemvn_kernel_L999:
+
+        sub     r3, fp, #192
+
+#if	defined(DOUBLE)
+        vldm    r3, { d8 - d15 }                                 // restore floating point registers
+#else
+        vldm    r3, { s8 - s15 }                                 // restore floating point registers
+#endif
+
+	mov	r0, #0		// set return value
+
+	// fp - 28 is the address of the pushed core registers
+	sub     sp, fp, #28
+	pop     {r4 -r9 ,fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/cgemv_t_vfp.S
+++ b/kernel/arm/cgemv_t_vfp.S
@ -0,0 +1,607 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/29 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* Bytes of scratch stack reserved below the pushed core registers. */
+#define STACKSIZE 256
+
+/*
+ * Arguments passed on the stack. The prologue pushes 7 words and then sets
+ * fp = sp + 28, i.e. the value sp had on entry, so these sit at fp + 0 upward.
+ */
+#define	OLD_LDA		[fp, #0 ]
+#define	X		[fp, #4 ]
+#define	OLD_INC_X	[fp, #8 ]
+#define	Y		[fp, #12 ]
+#define	OLD_INC_Y	[fp, #16 ]
+/* Incoming register arguments; both are spilled to the A / N slots below. */
+#define OLD_A		r3
+#define	OLD_N		r1
+
+/* Working registers. AO1 shares r1 with OLD_N, so N is spilled before AO1 is first loaded. */
+#define M	r0
+#define AO1	r1
+#define J	r2
+
+#define AO2	r4
+#define XO	r5
+#define YO	r6
+#define LDA	r7
+#define INC_X	r8
+#define INC_Y	r9
+
+#define I	r12
+
+/* Spill slots inside the reserved stack area. */
+#define N	[fp, #-252 ]
+#define A	[fp, #-256 ]
+
+
+/* Prefetch distances; not referenced anywhere else in this file. */
+#define X_PRE	512
+#define A_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/*
+ * Sign selection for the complex multiply-accumulate steps.
+ * fmacs adds the product to the destination, fnmacs subtracts it.
+ *
+ * KMAC_R / KMAC_I are used by the KERNEL_* macros for the two product terms
+ * that involve the second word of the A element:
+ *   KMAC_R = fnmacs, KMAC_I = fmacs   accumulates  a * x
+ *   KMAC_R = fmacs,  KMAC_I = fnmacs  accumulates  conj(a) * x
+ *
+ * FMAC_R1 / R2 / I1 / I2 are used by the SAVE_* macros to fold the
+ * accumulator acc into y, scaled by the values in s0 / s1:
+ *   R2 = fnmacs, I1 = fmacs           y += alpha * acc
+ *   R2 = fmacs,  I1 = fnmacs          y += alpha * conj(acc)
+ * s0 / s1 are never written in this file; presumably they still hold
+ * alpha as passed by the caller - TODO confirm against the calling convention.
+ */
+#if !defined(CONJ) && !defined(XCONJ)
+
+        #define KMAC_R  fnmacs
+        #define KMAC_I  fmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fnmacs
+        #define FMAC_I1 fmacs
+        #define FMAC_I2 fmacs
+
+#elif defined(CONJ) && !defined(XCONJ)
+
+        #define KMAC_R  fmacs
+        #define KMAC_I  fnmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fnmacs
+        #define FMAC_I1 fmacs
+        #define FMAC_I2 fmacs
+
+#elif !defined(CONJ) && defined(XCONJ)
+
+        #define KMAC_R  fmacs
+        #define KMAC_I  fnmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fmacs
+        #define FMAC_I1 fnmacs
+        #define FMAC_I2 fmacs
+
+#else
+
+        #define KMAC_R  fnmacs
+        #define KMAC_I  fmacs
+
+        #define FMAC_R1 fmacs
+        #define FMAC_R2 fmacs
+        #define FMAC_I1 fnmacs
+        #define FMAC_I2 fmacs
+
+#endif
+
+
+
+/*
+ * Clear the accumulators s12-s15 before a two-column pass.
+ * The zero bit pattern is moved in from a core register. Subtracting a
+ * register from itself is not a safe way to clear it: if it holds NaN or
+ * Inf (stale callee-saved contents, or the result of the previous column
+ * pair) the difference is NaN and every later column would be poisoned.
+ * Clobbers I; every call site loads I with the row count right after this macro.
+ */
+.macro INIT_F2
+
+	mov	I, #0
+	vmov	s12, I
+	vmov	s13, I
+	vmov	s14, I
+	vmov	s15, I
+
+.endm
+
+/* Four back-to-back KERNEL_F2X1 steps: one unrolled pass over four rows. */
+.macro KERNEL_F2X4
+
+	.rept	4
+	KERNEL_F2X1
+	.endr
+
+.endm
+
+/*
+ * One row step of the unit-stride two-column kernel.
+ * Loads one complex x element (s2, s3) and one complex element from each
+ * of the two columns (AO1 -> s4, s5 and AO2 -> s8, s9), all with
+ * post-increment, and accumulates the products into s12 / s13 (column at
+ * AO1) and s14 / s15 (column at AO2).
+ */
+.macro KERNEL_F2X1
+
+	fldmias	XO! ,  { s2 - s3 }
+	fldmias	AO1!,  { s4 - s5 }
+	fldmias	AO2!,  { s8 - s9   }
+
+	fmacs	s12 , s4 , s2
+	fmacs	s13 , s4 , s3
+	KMAC_R  s12 , s5 , s3
+        KMAC_I  s13 , s5 , s2
+
+	fmacs	s14 , s8 , s2
+	fmacs	s15 , s8 , s3
+        KMAC_R  s14 , s9 , s3
+        KMAC_I  s15 , s9 , s2
+
+.endm
+
+/*
+ * Fold the two accumulated column results (s12 / s13 and s14 / s15) into
+ * two consecutive complex y elements at YO, scaled by s0 / s1, and advance
+ * YO past them (store with write-back).
+ */
+.macro	SAVE_F2
+
+	fldmias	YO,  { s4 - s7 }
+
+	FMAC_R1 s4 , s0 , s12
+        FMAC_I1 s5 , s0 , s13
+        FMAC_R2 s4 , s1 , s13
+        FMAC_I2 s5 , s1 , s12
+
+        FMAC_R1 s6 , s0 , s14
+        FMAC_I1 s7 , s0 , s15
+        FMAC_R2 s6 , s1 , s15
+        FMAC_I2 s7 , s1 , s14
+
+	fstmias	YO!, { s4 - s7 }
+
+.endm
+
+/************************************************************************************************/
+
+/*
+ * Clear the accumulator pair s12 / s13 before a single-column pass.
+ * The zero bit pattern is moved in from a core register. Subtracting a
+ * register from itself is not a safe way to clear it: if it holds NaN or
+ * Inf (stale callee-saved contents, or the result of an earlier column)
+ * the difference is NaN and the column result would be poisoned.
+ * Clobbers I; the call site loads I with the row count right after this macro.
+ */
+.macro INIT_F1
+
+	mov	I, #0
+	vmov	s12, I
+	vmov	s13, I
+
+.endm
+
+/* Four back-to-back KERNEL_F1X1 steps: one unrolled pass over four rows. */
+.macro KERNEL_F1X4
+
+	.rept	4
+	KERNEL_F1X1
+	.endr
+
+.endm
+
+/*
+ * One row step of the unit-stride single-column kernel.
+ * Loads one complex x element (s2, s3) and one complex column element
+ * (s4, s5), both with post-increment, and accumulates their product into
+ * s12 / s13.
+ */
+.macro KERNEL_F1X1
+
+	fldmias	XO! ,  { s2 - s3 }
+	fldmias	AO1!,  { s4 - s5 }
+
+	fmacs	s12 , s4 , s2
+	fmacs	s13 , s4 , s3
+	KMAC_R  s12 , s5 , s3
+        KMAC_I  s13 , s5 , s2
+
+.endm
+
+/*
+ * Fold the accumulated column result (s12 / s13) into the complex y element
+ * at YO, scaled by s0 / s1, and advance YO past it (store with write-back).
+ */
+.macro	SAVE_F1
+
+	fldmias	YO,  { s4 - s5 }
+
+	FMAC_R1 s4 , s0 , s12
+        FMAC_I1 s5 , s0 , s13
+        FMAC_R2 s4 , s1 , s13
+        FMAC_I2 s5 , s1 , s12
+
+	fstmias	YO!, { s4 - s5 }
+
+.endm
+
+/************************************************************************************************/
+
+/*
+ * Clear the accumulators s12-s15 before a strided two-column pass.
+ * The zero bit pattern is moved in from a core register. Subtracting a
+ * register from itself is not a safe way to clear it: if it holds NaN or
+ * Inf (stale callee-saved contents, or the result of the previous column
+ * pair) the difference is NaN and every later column would be poisoned.
+ * Clobbers I; every call site loads I with the row count right after this macro.
+ */
+.macro INIT_S2
+
+	mov	I, #0
+	vmov	s12, I
+	vmov	s13, I
+	vmov	s14, I
+	vmov	s15, I
+
+.endm
+
+/* Four back-to-back KERNEL_S2X1 steps: one unrolled pass over four rows. */
+.macro KERNEL_S2X4
+
+	.rept	4
+	KERNEL_S2X1
+	.endr
+
+.endm
+
+/*
+ * One row step of the strided two-column kernel.
+ * Same arithmetic as the unit-stride variant, but x is loaded without
+ * write-back and XO is advanced by INC_X afterwards. The two column
+ * pointers AO1 / AO2 still use post-increment loads.
+ */
+.macro KERNEL_S2X1
+
+	fldmias	XO  ,  { s2 - s3 }
+	fldmias	AO1!,  { s4 - s5 }
+	fldmias	AO2!,  { s8 - s9   }
+
+	fmacs	s12 , s4 , s2
+	fmacs	s13 , s4 , s3
+	KMAC_R  s12 , s5 , s3
+        KMAC_I  s13 , s5 , s2
+
+	fmacs	s14 , s8 , s2
+	fmacs	s15 , s8 , s3
+        KMAC_R  s14 , s9 , s3
+        KMAC_I  s15 , s9 , s2
+
+	add	XO, XO, INC_X
+
+.endm
+
+/*
+ * Fold the two accumulated column results into two strided y elements.
+ * Each element is loaded, updated (scaled by s0 / s1) and stored back
+ * without write-back; YO is then advanced by INC_Y.
+ */
+.macro	SAVE_S2
+
+	// first y element <- accumulators s12 / s13
+	fldmias	YO,  { s4 - s5 }
+
+	FMAC_R1 s4 , s0 , s12
+        FMAC_I1 s5 , s0 , s13
+        FMAC_R2 s4 , s1 , s13
+        FMAC_I2 s5 , s1 , s12
+
+	fstmias	YO,  { s4 - s5 }
+
+	add	YO, YO, INC_Y
+
+	// second y element <- accumulators s14 / s15
+	fldmias	YO,  { s6 - s7 }
+
+        FMAC_R1 s6 , s0 , s14
+        FMAC_I1 s7 , s0 , s15
+        FMAC_R2 s6 , s1 , s15
+        FMAC_I2 s7 , s1 , s14
+
+	fstmias	YO,  { s6 - s7 }
+
+	add	YO, YO, INC_Y
+
+.endm
+
+/************************************************************************************************/
+
+/*
+ * Clear the accumulator pair s12 / s13 before a strided single-column pass.
+ * The zero bit pattern is moved in from a core register. Subtracting a
+ * register from itself is not a safe way to clear it: if it holds NaN or
+ * Inf (stale callee-saved contents, or the result of an earlier column)
+ * the difference is NaN and the column result would be poisoned.
+ * Clobbers I; the call site loads I with the row count right after this macro.
+ */
+.macro INIT_S1
+
+	mov	I, #0
+	vmov	s12, I
+	vmov	s13, I
+
+.endm
+
+/* Four back-to-back KERNEL_S1X1 steps: one unrolled pass over four rows. */
+.macro KERNEL_S1X4
+
+	.rept	4
+	KERNEL_S1X1
+	.endr
+
+.endm
+
+/*
+ * One row step of the strided single-column kernel.
+ * x is loaded without write-back and XO is advanced by INC_X afterwards;
+ * the column pointer AO1 uses a post-increment load. The product is
+ * accumulated into s12 / s13.
+ */
+.macro KERNEL_S1X1
+
+	fldmias	XO  ,  { s2 - s3 }
+	fldmias	AO1!,  { s4 - s5 }
+
+	fmacs	s12 , s4 , s2
+	fmacs	s13 , s4 , s3
+	KMAC_R  s12 , s5 , s3
+        KMAC_I  s13 , s5 , s2
+
+	add	XO, XO, INC_X
+
+.endm
+
+/*
+ * Fold the accumulated column result (s12 / s13) into the strided y element
+ * at YO, scaled by s0 / s1, then advance YO by INC_Y.
+ */
+.macro	SAVE_S1
+
+	fldmias	YO,  { s4 - s5 }
+
+	FMAC_R1 s4 , s0 , s12
+        FMAC_I1 s5 , s0 , s13
+        FMAC_R2 s4 , s1 , s13
+        FMAC_I2 s5 , s1 , s12
+
+	fstmias	YO,  { s4 - s5 }
+
+	add	YO, YO, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/*
+ * Kernel entry point (labels cgemvt_kernel_*).
+ *
+ * In:  r0 = M, r1 = N, r3 = A; LDA, X, INC_X, Y, INC_Y on the stack.
+ *      s0 / s1 are read by the SAVE_* macros and never written here.
+ * Out: r0 = 0.
+ *
+ * Flow: save registers, bail out on empty or zero-stride input, spill A and
+ * N, scale LDA to bytes, then walk the columns two at a time (J = N / 2)
+ * with one leftover column when N is odd. For each column (pair) the rows
+ * are consumed four per unrolled pass plus M % 4 single steps, and the
+ * result is folded into y. F* labels handle INC_X == INC_Y == 1, S* labels
+ * everything else.
+ *
+ * NOTE(review): several loop guards use ble directly after ands / asrs.
+ * Those instructions leave the V flag untouched, so ble also tests a stale
+ * V; beq would express the intended zero test directly.
+ */
+	PROLOGUE
+
+	.align 5
+	// 7 words are pushed, so fp = sp + 28 is the value sp had on entry
+	push    {r4 - r9 , fp}
+        add     fp, sp, #28
+	sub     sp, sp, #STACKSIZE                              // reserve stack
+
+        sub     r12, fp, #192
+
+#if	defined(DOUBLE)
+        vstm    r12, { d8 - d15 }                                 // store floating point registers
+#else
+        vstm    r12, { s8 - s15 }                                 // store floating point registers
+#endif
+
+	// nothing to do for an empty matrix
+	cmp	M, #0
+	ble	cgemvt_kernel_L999
+
+	cmp	OLD_N, #0
+	ble	cgemvt_kernel_L999
+
+	// spill A and N: r1 is reused as AO1 and r3 as scratch
+	str	OLD_A, A
+	str	OLD_N, N
+
+	ldr    INC_X , OLD_INC_X
+	ldr    INC_Y , OLD_INC_Y
+
+	// a zero increment is treated as nothing to do
+	cmp	INC_X, #0
+	beq	cgemvt_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	cgemvt_kernel_L999
+
+	ldr	LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #4				// LDA * SIZE * 2
+#else
+	lsl	LDA, LDA, #3				// LDA * SIZE * 2
+#endif
+
+	// anything other than unit stride on both vectors takes the S path
+	cmp	INC_X, #1
+	bne	cgemvt_kernel_S2_BEGIN
+
+	cmp	INC_Y, #1
+	bne	cgemvt_kernel_S2_BEGIN
+
+
+/* ---- unit-stride path: column pairs ---- */
+cgemvt_kernel_F2_BEGIN:
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	cgemvt_kernel_F1_BEGIN
+
+cgemvt_kernel_F2X4:
+
+	// AO1 / AO2 = the two current columns; the saved A moves on by two columns
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_F2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	cgemvt_kernel_F2X1
+
+
+cgemvt_kernel_F2X4_10:
+
+	KERNEL_F2X4
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_F2X4_10
+
+
+cgemvt_kernel_F2X1:
+
+	// leftover rows (M % 4), one at a time
+	ands	I, M , #3
+	ble	cgemvt_kernel_F2_END
+
+cgemvt_kernel_F2X1_10:
+
+	KERNEL_F2X1
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_F2X1_10
+
+
+cgemvt_kernel_F2_END:
+
+	SAVE_F2
+
+	subs	J , J , #1
+	bne	cgemvt_kernel_F2X4
+
+
+/* ---- unit-stride path: the single leftover column when N is odd ---- */
+cgemvt_kernel_F1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1
+	ble	cgemvt_kernel_L999
+
+cgemvt_kernel_F1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_F1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	cgemvt_kernel_F1X1
+
+
+cgemvt_kernel_F1X4_10:
+
+	KERNEL_F1X4
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_F1X4_10
+
+
+cgemvt_kernel_F1X1:
+
+	// leftover rows (M % 4), one at a time
+	ands	I, M , #3
+	ble	cgemvt_kernel_F1_END
+
+cgemvt_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_F1X1_10
+
+
+cgemvt_kernel_F1_END:
+
+	SAVE_F1
+
+	b	cgemvt_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+/* ---- strided path: increments are converted to byte offsets first ---- */
+cgemvt_kernel_S2_BEGIN:
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
+#endif
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	cgemvt_kernel_S1_BEGIN
+
+cgemvt_kernel_S2X4:
+
+	// AO1 / AO2 = the two current columns; the saved A moves on by two columns
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	cgemvt_kernel_S2X1
+
+
+cgemvt_kernel_S2X4_10:
+
+	KERNEL_S2X4
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_S2X4_10
+
+
+cgemvt_kernel_S2X1:
+
+	// leftover rows (M % 4), one at a time
+	ands	I, M , #3
+	ble	cgemvt_kernel_S2_END
+
+cgemvt_kernel_S2X1_10:
+
+	KERNEL_S2X1
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_S2X1_10
+
+
+cgemvt_kernel_S2_END:
+
+	SAVE_S2
+
+	subs	J , J , #1
+	bne	cgemvt_kernel_S2X4
+
+
+/* ---- strided path: the single leftover column when N is odd ---- */
+cgemvt_kernel_S1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1
+	ble	cgemvt_kernel_L999
+
+cgemvt_kernel_S1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_S1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	cgemvt_kernel_S1X1
+
+
+cgemvt_kernel_S1X4_10:
+
+	KERNEL_S1X4
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_S1X4_10
+
+
+cgemvt_kernel_S1X1:
+
+	// leftover rows (M % 4), one at a time
+	ands	I, M , #3
+	ble	cgemvt_kernel_S1_END
+
+cgemvt_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	I, I, #1
+	bne	cgemvt_kernel_S1X1_10
+
+
+cgemvt_kernel_S1_END:
+
+	SAVE_S1
+
+
+
+/*************************************************************************************************************/
+
+/* ---- common exit: restore the saved registers and return 0 ---- */
+cgemvt_kernel_L999:
+
+        sub     r3, fp, #192
+
+#if	defined(DOUBLE)
+        vldm    r3, { d8 - d15 }                                 // restore floating point registers
+#else
+        vldm    r3, { s8 - s15 }                                 // restore floating point registers
+#endif
+
+	mov	r0, #0		// set return value
+
+	// fp - 28 is the address of the pushed core registers
+	sub     sp, fp, #28
+	pop     {r4 -r9 ,fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/copy.c
+++ b/kernel/arm/copy.c
@ -0,0 +1,59 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+/*
+ * Strided vector copy: y[k * inc_y] = x[k * inc_x] for k = 0 .. n-1.
+ * Does nothing when n <= 0. Always returns 0.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG k;
+	BLASLONG src = 0;
+	BLASLONG dst = 0;
+
+	/* a negative or zero n fails the loop test immediately */
+	for (k = 0; k < n; k++, src += inc_x, dst += inc_y)
+		y[dst] = x[src];
+
+	return(0);
+}
+	
+
--- a/kernel/arm/ctrmm_kernel_2x2_vfp.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S
--- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
--- a/kernel/arm/dcopy_vfp.S
+++ b/kernel/arm/dcopy_vfp.S
@ -0,0 +1,222 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+#define	OLD_Y	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]
+
+#define I	r5
+#define Y	r6
+#define INC_Y	r7
+
+#define X_PRE	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/*
+ * Unit-stride copy of four doubles: prefetch X_PRE bytes ahead of X, load
+ * d0-d3 from X and store them to Y, both with post-increment.
+ */
+.macro COPY_F4
+
+	pld	[ X, #X_PRE  ]
+	fldmiad	X!, { d0 - d3 }
+	fstmiad	Y!, { d0 - d3 }
+
+.endm
+
+/* Unit-stride copy of one double through d0; X and Y are post-incremented. */
+.macro COPY_F1
+
+	fldmiad	X!, { d0 }
+	fstmiad	Y!, { d0 }
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+/*
+ * Strided copy of four doubles. Each element is loaded and stored without
+ * write-back, then X and Y are advanced by INC_X / INC_Y. The transfers
+ * alternate between d0 and d1.
+ */
+.macro COPY_S4
+
+	// NOTE(review): purpose of this nop is not evident from the code (possibly alignment or scheduling) - confirm
+	nop
+	fldmiad	X, { d0 }
+	fstmiad	Y, { d0 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X, { d1 }
+	fstmiad	Y, { d1 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X, { d0 }
+	fstmiad	Y, { d0 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X, { d1 }
+	fstmiad	Y, { d1 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+/* Strided copy of one double through d0; X and Y advance by INC_X / INC_Y. */
+.macro COPY_S1
+
+	fldmiad	X, { d0 }
+	fstmiad	Y, { d0 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/*
+ * Kernel entry point (labels dcopy_kernel_*).
+ *
+ * In:  r0 = N, r1 = X, r2 = INC_X, r3 = Y, INC_Y on the stack.
+ * Out: r0 = 0.
+ *
+ * Copies N doubles from X to Y. Returns without copying when N <= 0 or
+ * either increment is 0. The F path handles INC_X == INC_Y == 1, the S path
+ * everything else; both copy four elements per pass plus N % 4 singles.
+ */
+	PROLOGUE
+
+	.align 5
+
+	// 7 words are pushed; fp = sp + 24 is the saved-fp slot, so the first stack argument is at fp + 4
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	// d8 - d15 are saved and restored although the COPY_* macros only use d0 - d3
+	sub	r4, fp, #128
+	vstm	r4, { d8 - d15} 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr	INC_Y, OLD_INC_Y
+	
+	cmp	N, #0
+	ble	dcopy_kernel_L999
+
+	// a zero increment is treated as nothing to do
+	cmp	INC_X, #0
+	beq	dcopy_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	dcopy_kernel_L999
+
+	// anything other than unit stride on both vectors takes the S path
+	cmp	INC_X, #1
+	bne	dcopy_kernel_S_BEGIN
+
+	cmp	INC_Y, #1
+	bne	dcopy_kernel_S_BEGIN
+
+/* ---- unit-stride path ---- */
+dcopy_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	dcopy_kernel_F1
+
+dcopy_kernel_F4:
+
+	COPY_F4
+
+	subs	I, I, #1
+	bne	dcopy_kernel_F4
+
+dcopy_kernel_F1:
+
+	// leftover elements (N % 4), one at a time
+	ands	I, N, #3
+	ble	dcopy_kernel_L999
+
+dcopy_kernel_F10:
+
+	COPY_F1
+
+	subs    I, I, #1
+        bne     dcopy_kernel_F10
+
+	b	dcopy_kernel_L999
+
+/* ---- strided path: increments are converted to byte offsets first ---- */
+dcopy_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+
+	asrs	I, N, #2					// I = N / 4
+	ble	dcopy_kernel_S1
+
+dcopy_kernel_S4:
+
+	COPY_S4
+
+	subs	I, I, #1
+	bne	dcopy_kernel_S4
+
+dcopy_kernel_S1:
+
+	// leftover elements (N % 4), one at a time
+	ands	I, N, #3
+	ble	dcopy_kernel_L999
+
+dcopy_kernel_S10:
+
+	COPY_S1
+
+	subs    I, I, #1
+        bne     dcopy_kernel_S10
+
+
+
+
+
+
+/* ---- common exit: restore the saved registers and return 0 ---- */
+dcopy_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	mov	r0, #0						// set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/ddot_vfp.S
+++ b/kernel/arm/ddot_vfp.S
@ -0,0 +1,248 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/11 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+#define	OLD_Y	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]
+
+#define I	r5
+#define Y	r6
+#define INC_Y	r7
+
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/*
+ * Unit-stride step over four element pairs.
+ * X values go to d8-d11 and Y values to d4-d7, all loaded with
+ * post-increment; the products are accumulated alternately into d0 and d1.
+ * Loads and multiply-accumulates are interleaved; keep the order.
+ */
+.macro KERNEL_F4
+
+	pld	[ X, #X_PRE  ]
+	fldmiad	X!, { d8 }
+	pld	[ Y, #X_PRE  ]
+	fldmiad	Y!, { d4 }
+	fldmiad	Y!, { d5 }
+	fmacd   d0  , d4,  d8
+	fldmiad	X!, { d9 }
+	fldmiad	Y!, { d6 }
+	fmacd   d1  , d5,  d9
+	fldmiad	X!, { d10 }
+	fldmiad	X!, { d11 }
+	fmacd   d0  , d6,  d10
+	fldmiad	Y!, { d7 }
+	fmacd   d1  , d7,  d11
+
+.endm
+
+/* Unit-stride step over one element pair: d0 += x * y, X and Y post-incremented. */
+.macro KERNEL_F1
+
+	fldmiad	X!, { d4 }
+	fldmiad	Y!, { d8 }
+	fmacd   d0  , d4,  d8
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+/*
+ * Strided step over four element pairs.
+ * Each pair is loaded without write-back (X into d4-d7, Y into d8-d11),
+ * X and Y are advanced by INC_X / INC_Y, and the products are accumulated
+ * alternately into d0 and d1.
+ */
+.macro KERNEL_S4
+
+	// NOTE(review): purpose of this nop is not evident from the code (possibly alignment or scheduling) - confirm
+	nop
+	fldmiad	X, { d4 }
+	fldmiad	Y, { d8 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d0  , d4,  d8
+
+	fldmiad	X, { d5 }
+	fldmiad	Y, { d9 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d1  , d5,  d9
+
+	fldmiad	X, { d6 }
+	fldmiad	Y, { d10 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d0  , d6,  d10
+
+	fldmiad	X, { d7 }
+	fldmiad	Y, { d11 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d1  , d7,  d11
+
+.endm
+
+
+/* Strided step over one element pair: d0 += x * y; X and Y advance by INC_X / INC_Y. */
+.macro KERNEL_S1
+
+	fldmiad	X, { d4 }
+	fldmiad	Y, { d8 }
+	add	X, X, INC_X
+	fmacd   d0  , d4,  d8
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/*
+ * Kernel entry point (labels ddot_kernel_*).
+ *
+ * In:  r0 = N, r1 = X, r2 = INC_X, r3 = Y, INC_Y on the stack.
+ * Out: d0 = sum of x[i] * y[i]; 0.0 when N <= 0 or either increment is 0.
+ *
+ * Two partial sums are kept in d0 and d1 and added on exit. The F path
+ * handles INC_X == INC_Y == 1, the S path everything else; both consume
+ * four element pairs per pass plus N % 4 single pairs.
+ */
+	PROLOGUE
+
+	.align 5
+
+	// 7 words are pushed; fp = sp + 24 is the saved-fp slot, so the first stack argument is at fp + 4
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	// the kernel macros use d8 - d11, which must be preserved for the caller
+	sub	r4, fp, #128
+	vstm	r4, { d8 - d15} 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr	INC_Y, OLD_INC_Y
+
+	// Clear both partial sums by moving a zero bit pattern in from a core
+	// register. Subtracting a register from itself is not safe here: d0 / d1
+	// hold whatever the caller left in them, and NaN - NaN or Inf - Inf is NaN.
+	// r4 is free again once the vstm above has executed.
+	mov	r4, #0
+	vmov	d0 , r4 , r4
+	vmov	d1 , r4 , r4
+
+	cmp	N, #0
+	ble	ddot_kernel_L999
+
+	// a zero increment returns 0.0
+	cmp	INC_X, #0
+	beq	ddot_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	ddot_kernel_L999
+
+	// anything other than unit stride on both vectors takes the S path
+	cmp	INC_X, #1
+	bne	ddot_kernel_S_BEGIN
+
+	cmp	INC_Y, #1
+	bne	ddot_kernel_S_BEGIN
+
+/* ---- unit-stride path ---- */
+ddot_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	ddot_kernel_F1
+
+	// the loop body is unrolled twice, with an exit check between the two copies
+ddot_kernel_F4:
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	ble	ddot_kernel_F1
+
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	ddot_kernel_F4
+
+ddot_kernel_F1:
+
+	// leftover element pairs (N % 4), one at a time
+	ands	I, N, #3
+	ble	ddot_kernel_L999
+
+ddot_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+        bne     ddot_kernel_F10
+
+	b	ddot_kernel_L999
+
+/* ---- strided path: increments are converted to byte offsets first ---- */
+ddot_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+
+	asrs	I, N, #2					// I = N / 4
+	ble	ddot_kernel_S1
+
+ddot_kernel_S4:
+
+	KERNEL_S4
+
+	subs	I, I, #1
+	bne	ddot_kernel_S4
+
+ddot_kernel_S1:
+
+	// leftover element pairs (N % 4), one at a time
+	ands	I, N, #3
+	ble	ddot_kernel_L999
+
+ddot_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+        bne     ddot_kernel_S10
+
+
+
+
+
+
+/* ---- common exit: restore the saved registers and return d0 + d1 ---- */
+ddot_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	vadd.f64	d0 , d0, d1				// set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/dgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/dgemm_kernel_4x2_vfp.S
@ -0,0 +1,806 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/27 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_M	r0
+#define	OLD_N	r1
+#define	OLD_K	r2
+#define	OLD_A	r3
+#define OLD_ALPHA d0
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define LDC	[fp, #-252 ]
+#define M	[fp, #-256 ]
+#define N	[fp, #-260 ]
+#define K	[fp, #-264 ]
+#define A	[fp, #-268 ]
+
+#define ALPHA	[fp, #-280]
+
+#define B	[fp, #4 ]
+#define C	[fp, #8 ]
+#define OLD_LDC	[fp, #12 ]
+
+#define I	r0
+#define J	r1
+#define L	r2
+
+#define	AO	r5
+#define	BO	r6
+
+#define	CO1	r8
+#define	CO2	r9
+
+#define K1	r7
+#define BC	r12
+
+#define A_PRE	96
+#define B_PRE	96
+#define C_PRE	32
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x2
+
+	// Clear the eight accumulators of a 4x2 tile
+	// (d8-d11: first column of C, d12-d15: second column).
+	// The zero is built from an integer register: "vsub d8, d8, d8"
+	// returns NaN when d8 still holds NaN or Inf, which would corrupt
+	// the whole tile. Clobbers r3 (scratch at every call site).
+	mov	r3, #0
+	vmov	d8, r3, r3					// d8 = +0.0
+	vmov.f64		d9, d8
+	vmov.f64		d10, d8
+	vmov.f64		d11, d8
+	vmov.f64		d12, d8
+	vmov.f64		d13, d8
+	vmov.f64		d14, d8
+	vmov.f64		d15, d8
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+
+	// One k step of the 4x2 tile: four values from AO times two values
+	// from BO, accumulated into d8-d15. Loads are interleaved with the
+	// multiply-accumulates; AO advances 32 bytes and BO 16 bytes.
+	pld	[ AO, #A_PRE ]
+	vldr	d4 , [ BO ]
+
+	vldr	d0 , [ AO ]
+	vldr	d1 , [ AO, #8 ]
+
+	vmla.f64	d8  , d0,  d4
+	vldr	d2 , [ AO, #16 ]
+	vmla.f64	d9  , d1,  d4
+	vldr	d3 , [ AO, #24 ]
+	vmla.f64	d10  , d2,  d4
+	vldr	d5 , [ BO, #8 ]
+	vmla.f64	d11  , d3,  d4
+
+	vmla.f64	d12  , d0,  d5
+	vmla.f64	d13  , d1,  d5
+	add	AO , AO, #32
+	vmla.f64	d14  , d2,  d5
+	add	BO , BO, #16
+	vmla.f64	d15  , d3,  d5
+
+
+.endm
+
+.macro SAVE4x2
+
+	// Write back a 4x2 tile: C += alpha * accumulator.
+	// CO1 addresses the first column, CO2 = CO1 + LDC the second.
+	// Clobbers r3 and d0, d4-d7; CO1 advances by 32 bytes.
+	ldr	r3  , LDC
+	add	CO2 , CO1, r3
+
+	vldr		d0, ALPHA
+
+
+	// first column: d8-d11
+	vldr	d4 , [CO1]
+	vldr	d5 , [CO1, #8 ]
+
+	pld	[ CO1, #C_PRE ]
+	vmla.f64	d4 , d0 , d8
+	vldr	d6 , [CO1, #16 ]
+	vmla.f64	d5 , d0 , d9
+	vldr	d7 , [CO1, #24 ]
+	vmla.f64	d6 , d0 , d10
+	vstr	d4 , [CO1]
+	vmla.f64	d7 , d0 , d11
+
+	vstr	d5 , [CO1, #8 ]
+	vstr	d6 , [CO1, #16 ]
+	vstr	d7 , [CO1, #24 ]
+
+	// second column: d12-d15
+	vldr	d4 , [CO2]
+	vldr	d5 , [CO2, #8 ]
+
+	pld	[ CO2, #C_PRE ]
+	vmla.f64	d4 , d0 , d12
+	vldr	d6 , [CO2, #16 ]
+	vmla.f64	d5 , d0 , d13
+	vldr	d7 , [CO2, #24 ]
+	vmla.f64	d6 , d0 , d14
+	vstr	d4 , [CO2]
+	vmla.f64	d7 , d0 , d15
+	add	CO1, CO1, #32
+
+	vstr	d5 , [CO2, #8 ]
+	vstr	d6 , [CO2, #16 ]
+	vstr	d7 , [CO2, #24 ]
+
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+
+	// Clear the four accumulators of a 2x2 tile (d8, d9, d12, d13).
+	// The zero comes from an integer register because "vsub d8, d8, d8"
+	// yields NaN when d8 holds NaN or Inf from an earlier tile.
+	// Clobbers r3 (scratch at every call site).
+	mov	r3, #0
+	vmov	d8, r3, r3					// d8 = +0.0
+	vmov.f64		d9, d8
+	vmov.f64		d12, d8
+	vmov.f64		d13, d8
+
+.endm
+
+.macro KERNEL2x2_SUB
+
+	// One k step of the 2x2 tile: two values from AO times two values
+	// from BO, accumulated into d8, d9, d12, d13.
+	vldr	d4 , [ BO ]
+	vldr	d5 , [ BO, #8 ]
+
+	vldr	d0 , [ AO ]
+	vldr	d1 , [ AO, #8 ]
+
+	vmla.f64	d8  , d0,  d4
+	vmla.f64	d9  , d1,  d4
+
+	vmla.f64	d12  , d0,  d5
+	vmla.f64	d13  , d1,  d5
+
+	add	AO , AO, #16
+	add	BO , BO, #16
+
+.endm
+
+.macro SAVE2x2
+
+	// Write back a 2x2 tile: C += alpha * accumulator.
+	// Clobbers r3, d0, d4, d5; CO1 advances by 16 bytes.
+	ldr	r3  , LDC
+	add	CO2 , CO1, r3
+
+	vldr		d0, ALPHA
+
+	// first column: d8, d9
+	vldr	d4 , [CO1]
+	vldr	d5 , [CO1, #8 ]
+
+	vmla.f64	d4 , d0 , d8
+	vmla.f64	d5 , d0 , d9
+
+	vstr	d4 , [CO1]
+	vstr	d5 , [CO1, #8 ]
+
+	// second column: d12, d13
+	vldr	d4 , [CO2]
+	vldr	d5 , [CO2, #8 ]
+
+	vmla.f64	d4 , d0 , d12
+	vmla.f64	d5 , d0 , d13
+
+	vstr	d4 , [CO2]
+	vstr	d5 , [CO2, #8 ]
+
+	add	CO1, CO1, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x2
+
+	// Clear the two accumulators of a 1x2 tile (d8, d12).
+	// The zero comes from an integer register because "vsub d8, d8, d8"
+	// yields NaN when d8 holds NaN or Inf from an earlier tile.
+	// Clobbers r3 (scratch at every call site).
+	mov	r3, #0
+	vmov	d8, r3, r3					// d8 = +0.0
+	vmov.f64		d12, d8
+
+.endm
+
+.macro KERNEL1x2_SUB
+
+	// One k step of the 1x2 tile: one value from AO times two values
+	// from BO, accumulated into d8 and d12.
+	vldr	d4 , [ BO ]
+	vldr	d5 , [ BO, #8 ]
+
+	vldr	d0 , [ AO ]
+
+	vmla.f64	d8  , d0,  d4
+
+	vmla.f64	d12  , d0,  d5
+
+	add	AO , AO, #8
+	add	BO , BO, #16
+
+.endm
+
+.macro SAVE1x2
+
+	// Write back a 1x2 tile: C += alpha * accumulator.
+	// Clobbers r3, d0, d4; CO1 advances by 8 bytes.
+	ldr	r3  , LDC
+	add	CO2 , CO1, r3
+
+	vldr		d0, ALPHA
+
+	// first column: d8
+	vldr	d4 , [CO1]
+
+	vmla.f64	d4 , d0 , d8
+
+	vstr	d4 , [CO1]
+
+	// second column: d12
+	vldr	d4 , [CO2]
+
+	vmla.f64	d4 , d0 , d12
+
+	vstr	d4 , [CO2]
+
+	add	CO1, CO1, #8
+
+.endm
+
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+
+	// Clear the four accumulators of a 4x1 tile (d8-d11).
+	// The zero comes from an integer register because "vsub d8, d8, d8"
+	// yields NaN when d8 holds NaN or Inf from an earlier tile.
+	// Clobbers r3 (scratch at every call site).
+	mov	r3, #0
+	vmov	d8, r3, r3					// d8 = +0.0
+	vmov.f64		d9, d8
+	vmov.f64		d10, d8
+	vmov.f64		d11, d8
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+
+	// One k step of the 4x1 tile: four values from AO times one value
+	// from BO, accumulated into d8-d11.
+	vldr	d4 , [ BO ]
+
+	vldr	d0 , [ AO ]
+	vldr	d1 , [ AO, #8 ]
+	vldr	d2 , [ AO, #16 ]
+	vldr	d3 , [ AO, #24 ]
+
+	vmla.f64	d8  , d0,  d4
+	vmla.f64	d9  , d1,  d4
+	vmla.f64	d10 , d2,  d4
+	vmla.f64	d11 , d3,  d4
+
+	add	AO , AO, #32
+	add	BO , BO, #8
+
+.endm
+
+.macro SAVE4x1
+
+	// Write back a 4x1 tile: C += alpha * accumulator (d8-d11).
+	// Clobbers d0, d4-d7; CO1 advances by 32 bytes.
+	vldr		d0, ALPHA
+
+	vldr	d4 , [CO1]
+	vldr	d5 , [CO1, #8 ]
+	vldr	d6 , [CO1, #16 ]
+	vldr	d7 , [CO1, #24 ]
+
+	vmla.f64	d4 , d0 , d8
+	vmla.f64	d5 , d0 , d9
+	vmla.f64	d6 , d0 , d10
+	vmla.f64	d7 , d0 , d11
+
+	vstr	d4 , [CO1]
+	vstr	d5 , [CO1, #8 ]
+	vstr	d6 , [CO1, #16 ]
+	vstr	d7 , [CO1, #24 ]
+
+	add	CO1, CO1, #32
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+
+	// Clear the two accumulators of a 2x1 tile (d8, d9).
+	// The zero comes from an integer register because "vsub d8, d8, d8"
+	// yields NaN when d8 holds NaN or Inf from an earlier tile.
+	// Clobbers r3 (scratch at every call site).
+	mov	r3, #0
+	vmov	d8, r3, r3					// d8 = +0.0
+	vmov.f64		d9 , d8
+
+.endm
+
+.macro KERNEL2x1_SUB
+
+	// One k step of the 2x1 tile: two values from AO times one value
+	// from BO, accumulated into d8 and d9.
+	vldr	d4 , [ BO ]
+
+	vldr	d0 , [ AO ]
+	vldr	d1 , [ AO, #8 ]
+
+	vmla.f64	d8  , d0,  d4
+	vmla.f64	d9  , d1,  d4
+
+	add	AO , AO, #16
+	add	BO , BO, #8
+
+.endm
+
+.macro SAVE2x1
+
+	// Write back a 2x1 tile: C += alpha * accumulator (d8, d9).
+	// Clobbers d0, d4, d5; CO1 advances by 16 bytes.
+	vldr		d0, ALPHA
+
+	vldr	d4 , [CO1]
+	vldr	d5 , [CO1, #8 ]
+
+	vmla.f64	d4 , d0 , d8
+	vmla.f64	d5 , d0 , d9
+
+	vstr	d4 , [CO1]
+	vstr	d5 , [CO1, #8 ]
+
+	add	CO1, CO1, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x1
+
+	// Clear the single accumulator of a 1x1 tile (d8).
+	// The zero comes from an integer register because "vsub d8, d8, d8"
+	// yields NaN when d8 holds NaN or Inf from an earlier tile.
+	// Clobbers r3 (scratch at every call site).
+	mov	r3, #0
+	vmov	d8, r3, r3					// d8 = +0.0
+
+.endm
+
+.macro KERNEL1x1_SUB
+
+	// One k step of the 1x1 tile: d8 += AO[0] * BO[0].
+	vldr	d4 , [ BO ]
+
+	vldr	d0 , [ AO ]
+
+	vmla.f64	d8  , d0,  d4
+
+	add	AO , AO, #8
+	add	BO , BO, #8
+
+.endm
+
+.macro SAVE1x1
+
+	// Write back a 1x1 tile: C += alpha * d8.
+	// Clobbers d0, d4; CO1 advances by 8 bytes.
+	vldr		d0, ALPHA
+
+	vldr	d4 , [CO1]
+
+	vmla.f64	d4 , d0 , d8
+
+	vstr	d4 , [CO1]
+
+	add	CO1, CO1, #8
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	/*
+	 * DGEMM inner kernel: C += alpha * A * B, computed in 4x2 tiles with
+	 * 2x2, 1x2, 4x1, 2x1 and 1x1 tiles for the edges.
+	 *
+	 * In:   r0 = M, r1 = N, r2 = K, r3 = A, d0 = alpha,
+	 *       stack: B, C, ldc (ldc is scaled by 8 below).
+	 * Out:  r0 = 0.
+	 *
+	 * A is consumed sequentially (AO is never rewound inside a column
+	 * block); B is rewound to BC for every tile and BC moves on by
+	 * K * 2 * 8 bytes after each pair of columns.
+	 *
+	 * Loop guards: LE reads the V flag, which ASRS, ANDS and TST do not
+	 * write. Shift results are therefore tested with an explicit
+	 * "cmp ..., #0" and bit tests branch with "beq", so no guard depends
+	 * on flags left behind by the caller.
+	 */
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	// spill the register arguments into the local frame
+	str	OLD_M, M
+	str	OLD_N, N
+	str	OLD_K, K
+	str	OLD_A, A
+	vstr	OLD_ALPHA, ALPHA
+
+	sub	r3, fp, #128
+	vstm	r3, { d8 - d15} 				// store floating point registers
+
+	ldr	r3, OLD_LDC
+	lsl	r3, r3, #3					// ldc = ldc * 8
+	str	r3, LDC
+
+	ldr	K1, K
+	ldr	BC, B
+
+	ldr	J, N
+	asr	J, J, #1					// J = N / 2
+	cmp	J, #0
+	ble	dgemm_kernel_L1_BEGIN
+
+
+/*********************************************************************************************/
+
+	// ---- column pairs ----
+
+dgemm_kernel_L2_BEGIN:
+
+	ldr	CO1, C						// CO1 = C
+	ldr	r4 , LDC
+	lsl	r4 , r4 , #1					// LDC * 2
+	add	r3 , r4, CO1
+	str	r3 , C						// store C
+
+	ldr	AO, A						// AO = A
+
+dgemm_kernel_L2_M4_BEGIN:
+
+	ldr	I, M
+	asr	I, I, #2					// I = M / 4
+	cmp	I, #0
+	ble	dgemm_kernel_L2_M2_BEGIN
+
+dgemm_kernel_L2_M4_20:
+
+	INIT4x2
+
+	mov	BO, BC
+	asr	L , K1, #3					// L = K / 8
+	cmp	L, #0
+	ble	dgemm_kernel_L2_M4_40
+	.align 5
+
+dgemm_kernel_L2_M4_22:
+
+	// eight k steps per iteration
+	pld	[ BO, #B_PRE ]
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	pld	[ BO, #B_PRE ]
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	pld	[ BO, #B_PRE ]
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	pld	[ BO, #B_PRE ]
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L2_M4_22
+
+
+dgemm_kernel_L2_M4_40:
+
+	ands	L , K1, #7					// L = K % 8
+	beq	dgemm_kernel_L2_M4_100
+
+dgemm_kernel_L2_M4_42:
+
+	KERNEL4x2_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L2_M4_42
+
+dgemm_kernel_L2_M4_100:
+
+	SAVE4x2
+
+dgemm_kernel_L2_M4_END:
+
+	subs	I, I, #1
+	bgt	dgemm_kernel_L2_M4_20
+
+
+dgemm_kernel_L2_M2_BEGIN:
+
+	ldr	I, M
+	tst	I , #3						// any rows left (M % 4)?
+	beq	dgemm_kernel_L2_END
+
+	tst	I, #2						// two leftover rows?
+	beq	dgemm_kernel_L2_M1_BEGIN
+
+dgemm_kernel_L2_M2_20:
+
+	INIT2x2
+
+	mov	BO, BC
+	asr	L , K1, #3					// L = K / 8
+	cmp	L, #0
+	ble	dgemm_kernel_L2_M2_40
+
+dgemm_kernel_L2_M2_22:
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L2_M2_22
+
+
+dgemm_kernel_L2_M2_40:
+
+	ands	L , K1, #7					// L = K % 8
+	beq	dgemm_kernel_L2_M2_100
+
+dgemm_kernel_L2_M2_42:
+
+	KERNEL2x2_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L2_M2_42
+
+dgemm_kernel_L2_M2_100:
+
+	SAVE2x2
+
+dgemm_kernel_L2_M2_END:
+
+
+dgemm_kernel_L2_M1_BEGIN:
+
+	tst	I, #1						// one leftover row? (I still holds M)
+	beq	dgemm_kernel_L2_END
+
+dgemm_kernel_L2_M1_20:
+
+	INIT1x2
+
+	mov	BO, BC
+	asr	L , K1, #3					// L = K / 8
+	cmp	L, #0
+	ble	dgemm_kernel_L2_M1_40
+
+dgemm_kernel_L2_M1_22:
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L2_M1_22
+
+
+dgemm_kernel_L2_M1_40:
+
+	ands	L , K1, #7					// L = K % 8
+	beq	dgemm_kernel_L2_M1_100
+
+dgemm_kernel_L2_M1_42:
+
+	KERNEL1x2_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L2_M1_42
+
+dgemm_kernel_L2_M1_100:
+
+	SAVE1x2
+
+
+dgemm_kernel_L2_END:
+
+	mov	r3, BC
+	mov	r4, K1
+	lsl	r4, r4, #4					// k * 2 * 8
+	add	r3, r3, r4					// B = B + K * 2 * 8
+	mov	BC, r3
+
+	subs	J , #1						// j--
+	bgt	dgemm_kernel_L2_BEGIN
+
+/*********************************************************************************************/
+
+	// ---- last single column when N is odd ----
+
+dgemm_kernel_L1_BEGIN:
+
+	ldr	J , N
+	tst	J , #1
+	beq	dgemm_kernel_L999
+
+
+	ldr	CO1, C						// CO1 = C
+	ldr	r4 , LDC
+	add	r3 , r4, CO1
+	str	r3 , C						// store C
+
+	ldr	AO, A						// AO = A
+
+
+
+dgemm_kernel_L1_M4_BEGIN:
+
+	ldr	I, M
+	asr	I, I, #2					// I = M / 4
+	cmp	I, #0
+	ble	dgemm_kernel_L1_M2_BEGIN
+
+dgemm_kernel_L1_M4_20:
+
+	INIT4x1
+
+	mov	BO, BC
+	asr	L , K1, #3					// L = K / 8
+	cmp	L, #0
+	ble	dgemm_kernel_L1_M4_40
+	.align 5
+
+dgemm_kernel_L1_M4_22:
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L1_M4_22
+
+
+dgemm_kernel_L1_M4_40:
+
+	ands	L , K1, #7					// L = K % 8
+	beq	dgemm_kernel_L1_M4_100
+
+dgemm_kernel_L1_M4_42:
+
+	KERNEL4x1_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L1_M4_42
+
+dgemm_kernel_L1_M4_100:
+
+	SAVE4x1
+
+dgemm_kernel_L1_M4_END:
+
+	subs	I, I, #1
+	bgt	dgemm_kernel_L1_M4_20
+
+
+dgemm_kernel_L1_M2_BEGIN:
+
+	ldr	I, M
+	tst	I , #3						// any rows left (M % 4)?
+	beq	dgemm_kernel_L1_END
+
+	tst	I, #2						// two leftover rows?
+	beq	dgemm_kernel_L1_M1_BEGIN
+
+dgemm_kernel_L1_M2_20:
+
+	INIT2x1
+
+	mov	BO, BC
+	asr	L , K1, #3					// L = K / 8
+	cmp	L, #0
+	ble	dgemm_kernel_L1_M2_40
+
+dgemm_kernel_L1_M2_22:
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L1_M2_22
+
+
+dgemm_kernel_L1_M2_40:
+
+	ands	L , K1, #7					// L = K % 8
+	beq	dgemm_kernel_L1_M2_100
+
+dgemm_kernel_L1_M2_42:
+
+	KERNEL2x1_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L1_M2_42
+
+dgemm_kernel_L1_M2_100:
+
+	SAVE2x1
+
+dgemm_kernel_L1_M2_END:
+
+
+dgemm_kernel_L1_M1_BEGIN:
+
+	tst	I, #1						// one leftover row? (I still holds M)
+	beq	dgemm_kernel_L1_END
+
+dgemm_kernel_L1_M1_20:
+
+	INIT1x1
+
+	mov	BO, BC
+	asr	L , K1, #3					// L = K / 8
+	cmp	L, #0
+	ble	dgemm_kernel_L1_M1_40
+
+dgemm_kernel_L1_M1_22:
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L1_M1_22
+
+
+dgemm_kernel_L1_M1_40:
+
+	ands	L , K1, #7					// L = K % 8
+	beq	dgemm_kernel_L1_M1_100
+
+dgemm_kernel_L1_M1_42:
+
+	KERNEL1x1_SUB
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L1_M1_42
+
+dgemm_kernel_L1_M1_100:
+
+	SAVE1x1
+
+
+dgemm_kernel_L1_END:
+
+
+dgemm_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	movs	r0, #0						// set return value
+	sub	sp, fp, #24					// drop the local frame
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
--- a/kernel/arm/dgemm_ncopy_2_vfp.S
+++ b/kernel/arm/dgemm_ncopy_2_vfp.S
@ -0,0 +1,225 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/24 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_M	r0
+#define	OLD_N	r1
+#define	OLD_A	r2
+#define	OLD_LDA	r3
+
+#define B	[fp, #4 ]
+
+#define M	r0
+#define N	r1
+#define A	r2
+
+#define	BO	r5
+
+#define	AO1	r6
+#define	AO2	r7
+#define	LDA	r8
+
+#define I	r3
+#define	J	r12
+
+#define A_PRE	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro COPY2x2
+
+	// Two elements from each of the two columns AO1 and AO2, stored
+	// interleaved at BO: AO1[0], AO2[0], AO1[1], AO2[1].
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d2 , [ AO1, #8  ]
+
+	vldr	d1 , [ AO2, #0  ]
+	vldr	d3 , [ AO2, #8  ]
+
+	add	AO1, AO1, #16
+	vstmia	BO!, { d0 - d3 }
+	add	AO2, AO2, #16
+
+.endm
+
+
+.macro COPY1x2
+
+	// One element from each of the two columns, stored as AO1[0], AO2[0].
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d1 , [ AO2, #0  ]
+	add	AO1, AO1, #8
+
+	vstmia	BO!, { d0 - d1 }
+	add	AO2, AO2, #8
+
+.endm
+
+.macro COPY2x1
+
+	// Two consecutive elements of the single column AO1, copied to BO.
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d1 , [ AO1, #8  ]
+
+	vstmia	BO!, { d0 - d1 }
+	add	AO1, AO1, #16
+
+.endm
+
+
+.macro COPY1x1
+
+	// One element of the single column AO1, copied to BO.
+	vldr	d0 , [ AO1, #0  ]
+
+	vstmia	BO!, { d0 }
+	add	AO1, AO1, #8
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	/*
+	 * Pack routine: walks A two columns at a time (column stride lda)
+	 * and writes the elements of both columns interleaved to B; a last
+	 * odd column is copied as is.
+	 *
+	 * In:   r0 = M, r1 = N, r2 = A, r3 = lda, stack: B.
+	 * Out:  r0 = 0.
+	 *
+	 * Loop guards: LE reads the V flag, which ASRS, ANDS and TST do not
+	 * write. Shift results are therefore tested with an explicit
+	 * "cmp ..., #0" and bit tests branch with "beq", so no guard depends
+	 * on flags left behind by the caller.
+	 */
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+
+	lsl	LDA, OLD_LDA, #3					// lda = lda * 8
+
+	ldr	BO, B
+
+
+/*********************************************************************************************/
+
+dgemm_ncopy_L2_BEGIN:
+
+	asr	J, N, #1					// J = N / 2
+	cmp	J, #0
+	ble	dgemm_ncopy_L1_BEGIN
+
+dgemm_ncopy_L2_M2_BEGIN:
+
+	mov	AO1, A						// AO1 = A
+	add	AO2, AO1, LDA
+	add	A  , AO2, LDA 					// A = A + 2 * LDA
+
+	asr	I, M, #1					// I = M / 2
+	cmp	I, #0
+	ble	dgemm_ncopy_L2_M2_40
+
+dgemm_ncopy_L2_M2_20:
+
+	COPY2x2
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L2_M2_20
+
+
+dgemm_ncopy_L2_M2_40:
+
+	ands	I, M , #1					// I = M % 2
+	beq	dgemm_ncopy_L2_M2_END
+
+dgemm_ncopy_L2_M2_60:
+
+	COPY1x2
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L2_M2_60
+
+
+dgemm_ncopy_L2_M2_END:
+
+	subs	J , J, #1						// j--
+	bne	dgemm_ncopy_L2_M2_BEGIN
+
+/*********************************************************************************************/
+
+dgemm_ncopy_L1_BEGIN:
+
+	tst	N, #1						// odd column left?
+	beq	dgemm_ncopy_L999
+
+
+dgemm_ncopy_L1_M2_BEGIN:
+
+	mov	AO1, A						// AO1 = A
+	add	A  , AO1, LDA 					// A = A + 1 * LDA
+
+	asr	I, M, #1					// I = M / 2
+	cmp	I, #0
+	ble	dgemm_ncopy_L1_M2_40
+
+dgemm_ncopy_L1_M2_20:
+
+	COPY2x1
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L1_M2_20
+
+
+dgemm_ncopy_L1_M2_40:
+
+	ands	I, M , #1					// I = M % 2
+	beq	dgemm_ncopy_L1_M2_END
+
+dgemm_ncopy_L1_M2_60:
+
+	COPY1x1
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L1_M2_60
+
+
+dgemm_ncopy_L1_M2_END:
+
+
+
+dgemm_ncopy_L999:
+
+
+	movs	r0, #0						// set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/dgemm_ncopy_4_vfp.S
+++ b/kernel/arm/dgemm_ncopy_4_vfp.S
@ -0,0 +1,349 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/05 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_M	r0
+#define	OLD_N	r1
+#define	OLD_A	r2
+#define	OLD_LDA	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define LDA	[fp, #-260 ]
+
+#define B	[fp, #4 ]
+
+#define M	r0
+#define N	r1
+#define A	r2
+
+#define	BO	r5
+
+#define	AO1	r6
+#define	AO2	r7
+#define	AO3	r8
+#define	AO4	r9
+
+#define I	r3
+#define	J	r12
+
+#define A_PRE	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro COPY4x4
+
+	// Four elements from each of the four columns AO1-AO4, written to
+	// BO grouped by element index: d0-d3 hold element 0 of each column,
+	// d4-d7 element 1, d8-d11 element 2, d12-d15 element 3.
+	pld	[ AO1, #A_PRE  ]
+	pld	[ AO2, #A_PRE  ]
+	pld	[ AO3, #A_PRE  ]
+	pld	[ AO4, #A_PRE  ]
+
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d1 , [ AO2, #0  ]
+	vldr	d2 , [ AO3, #0  ]
+	vldr	d3 , [ AO4, #0  ]
+
+	vldr	d4 , [ AO1, #8  ]
+	vldr	d8 , [ AO1, #16 ]
+	vldr	d12, [ AO1, #24 ]
+
+	vldr	d5 , [ AO2, #8  ]
+	add	AO1, AO1, #32
+	vldr	d9 , [ AO2, #16 ]
+	vldr	d13, [ AO2, #24 ]
+
+	vldr	d6 , [ AO3, #8  ]
+	add	AO2, AO2, #32
+	vldr	d10, [ AO3, #16 ]
+	vldr	d14, [ AO3, #24 ]
+
+	vldr	d7 , [ AO4, #8  ]
+	add	AO3, AO3, #32
+	vldr	d11, [ AO4, #16 ]
+	vldr	d15, [ AO4, #24 ]
+
+	vstmia	BO!, { d0 - d3 }
+	add	AO4, AO4, #32
+	vstmia	BO!, { d4 - d7 }
+	vstmia	BO!, { d8 - d15 }
+
+.endm
+
+.macro COPY1x4
+
+	// One element from each of the four columns AO1-AO4, stored
+	// consecutively at BO.
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d1 , [ AO2, #0  ]
+	add	AO1, AO1, #8
+	vldr	d2 , [ AO3, #0  ]
+	add	AO2, AO2, #8
+	vldr	d3 , [ AO4, #0  ]
+
+	add	AO3, AO3, #8
+	vstmia	BO!, { d0 - d3 }
+	add	AO4, AO4, #8
+
+.endm
+
+.macro COPY4x2
+
+	// Four elements from each of the two columns AO1 and AO2, stored
+	// interleaved at BO (even registers from AO1, odd from AO2).
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d2 , [ AO1, #8  ]
+	vldr	d4 , [ AO1, #16 ]
+	vldr	d6 , [ AO1, #24 ]
+
+	vldr	d1 , [ AO2, #0  ]
+	vldr	d3 , [ AO2, #8  ]
+	add	AO1, AO1, #32
+	vldr	d5 , [ AO2, #16 ]
+	vldr	d7 , [ AO2, #24 ]
+
+	vstmia	BO!, { d0 - d7 }
+	add	AO2, AO2, #32
+
+.endm
+
+
+.macro COPY1x2
+
+	// One element from each of the two columns, stored as AO1[0], AO2[0].
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d1 , [ AO2, #0  ]
+	add	AO1, AO1, #8
+
+	vstmia	BO!, { d0 - d1 }
+	add	AO2, AO2, #8
+
+.endm
+
+.macro COPY4x1
+
+	// Four consecutive elements of the single column AO1, copied to BO.
+	vldr	d0 , [ AO1, #0  ]
+	vldr	d1 , [ AO1, #8  ]
+	vldr	d2 , [ AO1, #16 ]
+	vldr	d3 , [ AO1, #24 ]
+
+	vstmia	BO!, { d0 - d3 }
+	add	AO1, AO1, #32
+
+.endm
+
+
+.macro COPY1x1
+
+	// One element of the single column AO1, copied to BO.
+	vldr	d0 , [ AO1, #0  ]
+
+	vstmia	BO!, { d0 }
+	add	AO1, AO1, #8
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	/*
+	 * Pack routine: walks A four columns at a time (column stride lda)
+	 * and writes the elements of those columns interleaved to B; the
+	 * remaining two and one columns are handled by the L2 and L1 parts.
+	 *
+	 * In:   r0 = M, r1 = N, r2 = A, r3 = lda, stack: B.
+	 * Out:  r0 = 0.
+	 *
+	 * Loop guards: LE reads the V flag, which ASRS, ANDS and TST do not
+	 * write. Shift results are therefore tested with an explicit
+	 * "cmp ..., #0" and bit tests branch with "beq", so no guard depends
+	 * on flags left behind by the caller.
+	 */
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+
+	lsl	r3, r3, #3					// lda = lda * 8
+	str	r3, LDA
+
+	sub	r4, fp, #128
+	vstm	r4, { d8 - d15} 				// store floating point registers
+
+	ldr	BO, B
+
+dgemm_ncopy_L4_BEGIN:
+
+	asr	J, N, #2					// J = N / 4
+	cmp	J, #0
+	ble	dgemm_ncopy_L2_BEGIN
+
+dgemm_ncopy_L4_M4_BEGIN:
+
+	mov	AO1, A						// AO1 = A
+	ldr	r4 , LDA
+	add	AO2, AO1, r4
+	add	AO3, AO2, r4
+	add	AO4, AO3, r4
+	add	A  , AO4, r4					// A = A + 4 * LDA
+
+	asr	I, M, #2					// I = M / 4
+	cmp	I, #0
+	ble	dgemm_ncopy_L4_M4_40
+
+dgemm_ncopy_L4_M4_20:
+
+	COPY4x4
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L4_M4_20
+
+
+dgemm_ncopy_L4_M4_40:
+
+	ands	I, M , #3					// I = M % 4
+	beq	dgemm_ncopy_L4_M4_END
+
+dgemm_ncopy_L4_M4_60:
+
+	COPY1x4
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L4_M4_60
+
+
+dgemm_ncopy_L4_M4_END:
+
+	subs	J , J, #1						// j--
+	bne	dgemm_ncopy_L4_M4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+dgemm_ncopy_L2_BEGIN:
+
+	tst	N, #3						// any columns left (N % 4)?
+	beq	dgemm_ncopy_L999
+
+	tst	N, #2						// a pair of columns left?
+	beq	dgemm_ncopy_L1_BEGIN
+
+dgemm_ncopy_L2_M4_BEGIN:
+
+	mov	AO1, A						// AO1 = A
+	ldr	r4 , LDA
+	add	AO2, AO1, r4
+	add	A  , AO2, r4 					// A = A + 2 * LDA
+
+	asr	I, M, #2					// I = M / 4
+	cmp	I, #0
+	ble	dgemm_ncopy_L2_M4_40
+
+dgemm_ncopy_L2_M4_20:
+
+	COPY4x2
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L2_M4_20
+
+
+dgemm_ncopy_L2_M4_40:
+
+	ands	I, M , #3					// I = M % 4
+	beq	dgemm_ncopy_L2_M4_END
+
+dgemm_ncopy_L2_M4_60:
+
+	COPY1x2
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L2_M4_60
+
+
+dgemm_ncopy_L2_M4_END:
+
+
+/*********************************************************************************************/
+
+dgemm_ncopy_L1_BEGIN:
+
+	tst	N, #1						// odd column left?
+	beq	dgemm_ncopy_L999
+
+
+dgemm_ncopy_L1_M4_BEGIN:
+
+	mov	AO1, A						// AO1 = A
+	ldr	r4 , LDA
+	add	A  , AO1, r4 					// A = A + 1 * LDA
+
+	asr	I, M, #2					// I = M / 4
+	cmp	I, #0
+	ble	dgemm_ncopy_L1_M4_40
+
+dgemm_ncopy_L1_M4_20:
+
+	COPY4x1
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L1_M4_20
+
+
+dgemm_ncopy_L1_M4_40:
+
+	ands	I, M , #3					// I = M % 4
+	beq	dgemm_ncopy_L1_M4_END
+
+dgemm_ncopy_L1_M4_60:
+
+	COPY1x1
+
+	subs	I , I , #1
+	bne	dgemm_ncopy_L1_M4_60
+
+
+dgemm_ncopy_L1_M4_END:
+
+
+
+dgemm_ncopy_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	movs	r0, #0						// set return value
+	sub	sp, fp, #24					// drop the local frame
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/dgemm_tcopy_4_vfp.S
+++ b/kernel/arm/dgemm_tcopy_4_vfp.S
@ -0,0 +1,408 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/06 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_M	r0
+#define	OLD_N	r1
+#define	OLD_A	r2
+#define	OLD_LDA	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define B	[fp, #4 ]
+#define A	[fp, #-248 ]
+
+#define M	r0
+#define N	r1
+#define M4	r2
+
+#define	LDA	r5
+
+#define	AO1	r6
+#define	BO1	r7
+#define	BO2	r8
+#define	BO3	r9
+
+#define I	r4
+#define	J	r12
+
+#define A_PRE	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro COPY4x4
+
+	// Four consecutive elements from each of four rows (AO1 and the
+	// three rows LDA bytes apart after it), stored as one 16-element
+	// block at BO1. r3 is the scratch row pointer.
+	// AO1 advances 32 bytes, BO1 advances by M4.
+	pld	[ AO1, #A_PRE  ]
+	vldmia	AO1, { d0 - d3 }
+
+	add	r3, AO1, LDA
+	pld	[ r3, #A_PRE  ]
+	vldmia	r3, { d4 - d7 }
+
+	add	r3, r3, LDA
+	pld	[ r3, #A_PRE  ]
+	vldmia	r3, { d8 - d11 }
+
+	add	r3, r3, LDA
+	pld	[ r3, #A_PRE  ]
+	vldmia	r3, { d12 - d15 }
+
+	vstmia	BO1, { d0 - d15 }
+	add	AO1, AO1, #32
+	add	BO1, BO1, M4
+
+.endm
+
+.macro COPY2x4
+
+	// Two consecutive elements from each of four rows, stored as one
+	// 8-element block at BO2. r3 is the scratch row pointer.
+	vldmia	AO1, { d0 - d1 }
+
+	add	r3, AO1, LDA
+	vldmia	r3, { d2 - d3 }
+
+	add	r3, r3, LDA
+	vldmia	r3, { d4 - d5 }
+
+	add	r3, r3, LDA
+	vldmia	r3, { d6 - d7 }
+
+	vstmia	BO2, { d0 - d7 }
+	add	AO1, AO1, #16
+	add	BO2, BO2, #64
+
+.endm
+
+.macro COPY1x4
+
+	// One element from each of four rows, stored as one 4-element block
+	// at BO3. r3 is the scratch row pointer.
+	vldmia	AO1, { d0 }
+
+	add	r3, AO1, LDA
+	vldmia	r3, { d1 }
+
+	add	r3, r3, LDA
+	vldmia	r3, { d2 }
+
+	add	r3, r3, LDA
+	vldmia	r3, { d3 }
+
+	vstmia	BO3, { d0 - d3 }
+	add	AO1, AO1, #8
+	add	BO3, BO3, #32
+
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY4x2
+
+	// Four consecutive elements from each of two rows, stored as one
+	// 8-element block at BO1. AO1 advances 32 bytes, BO1 advances by M4.
+	pld	[ AO1, #A_PRE  ]
+	vldmia	AO1, { d0 - d3 }
+
+	add	r3, AO1, LDA
+	pld	[ r3, #A_PRE  ]
+	vldmia	r3, { d4 - d7 }
+
+	vstmia	BO1, { d0 - d7 }
+	add	AO1, AO1, #32
+	add	BO1, BO1, M4
+
+.endm
+
+.macro COPY2x2
+
+	// Two consecutive elements from each of two rows, stored as one
+	// 4-element block at BO2.
+	vldmia	AO1, { d0 - d1 }
+
+	add	r3, AO1, LDA
+	vldmia	r3, { d2 - d3 }
+
+	vstmia	BO2, { d0 - d3 }
+	add	AO1, AO1, #16
+	add	BO2, BO2, #32
+
+.endm
+
+.macro COPY1x2
+
+	// One element from each of two rows, stored as a pair at BO3.
+	vldmia	AO1, { d0 }
+
+	add	r3, AO1, LDA
+	vldmia	r3, { d1 }
+
+	vstmia	BO3, { d0 - d1 }
+	add	AO1, AO1, #8
+	add	BO3, BO3, #16
+
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY4x1
+
+	// Four consecutive elements of a single row, copied to BO1.
+	// AO1 advances 32 bytes, BO1 advances by M4.
+	pld	[ AO1, #A_PRE  ]
+	vldmia	AO1, { d0 - d3 }
+
+	vstmia	BO1, { d0 - d3 }
+	add	AO1, AO1, #32
+	add	BO1, BO1, M4
+
+.endm
+
+.macro COPY2x1
+
+	// Two consecutive elements of a single row, copied to BO2.
+	vldmia	AO1, { d0 - d1 }
+
+	vstmia	BO2, { d0 - d1 }
+	add	AO1, AO1, #16
+	add	BO2, BO2, #16
+
+.endm
+
+.macro COPY1x1
+
+	// One element of a single row, copied to BO3.
+	vldmia	AO1, { d0 }
+
+	vstmia	BO3, { d0 }
+	add	AO1, AO1, #8
+	add	BO3, BO3, #8
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	/*
+	 * Pack routine: walks A four rows at a time (row stride lda) and
+	 * writes 4-wide column groups to B. Three output cursors are used:
+	 *   BO1 - groups of four columns, consecutive groups M4 bytes apart
+	 *   BO2 - the leftover pair of columns,  B + M * (N & -4) * 8
+	 *   BO3 - the leftover single column,    B + M * (N & -2) * 8
+	 *
+	 * In:   r0 = M, r1 = N, r2 = A, r3 = lda, stack: B.
+	 * Out:  r0 = 0.
+	 * The stack slot of B is reused as the running output base.
+	 *
+	 * Loop guards: LE reads the V flag, which ASRS, ANDS and TST do not
+	 * write. Shift results are therefore tested with an explicit
+	 * "cmp ..., #0" and bit tests branch with "beq", so no guard depends
+	 * on flags left behind by the caller.
+	 */
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	str	OLD_A, A					// store A
+
+	lsl	LDA, OLD_LDA, #3				// lda = lda * SIZE
+
+	sub	r4, fp, #128
+	vstm	r4, { d8 - d15} 				// store floating point registers
+
+	lsl	r4 , M, #3					// M * SIZE
+
+	ldr	r3, B
+
+	and	BO2 , N , #-4					// N rounded down to a multiple of 4
+	and	BO3 , N , #-2					// N rounded down to a multiple of 2
+
+	mul	BO2, BO2, r4
+	mul	BO3, BO3, r4
+
+	add	BO2 , BO2, r3
+	add	BO3 , BO3, r3
+
+	lsl	M4, M, #5					// M4 = M * 4 * SIZE
+
+dgemm_tcopy_L4_BEGIN:
+
+	asr	J, M, #2					// J = M / 4
+	cmp	J, #0
+	ble	dgemm_tcopy_L2_BEGIN
+
+dgemm_tcopy_L4_M4_BEGIN:
+
+	ldr	AO1, A						// AO1 = A
+	lsl	r3, LDA, #2					// r3 = 4 * LDA
+	add	r3, r3 , AO1					// A = A + 4 * LDA
+	str	r3, A						// store A
+
+	ldr	BO1, B
+	add	r3, BO1, #128					// B = B + 16 * SIZE
+	str	r3, B
+
+	asr	I, N, #2					// I = N / 4
+	cmp	I, #0
+	ble	dgemm_tcopy_L4_M4_40
+
+dgemm_tcopy_L4_M4_20:
+
+	COPY4x4
+
+	subs	I , I , #1
+	bne	dgemm_tcopy_L4_M4_20
+
+
+dgemm_tcopy_L4_M4_40:
+
+	tst	N , #2						// leftover pair of columns?
+	beq	dgemm_tcopy_L4_M4_60
+
+	COPY2x4
+
+
+dgemm_tcopy_L4_M4_60:
+
+	tst	N, #1						// leftover single column?
+	beq	dgemm_tcopy_L4_M4_END
+
+	COPY1x4
+
+
+dgemm_tcopy_L4_M4_END:
+
+	subs	J , J, #1						// j--
+	bne	dgemm_tcopy_L4_M4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+dgemm_tcopy_L2_BEGIN:
+
+	tst	M, #3						// any rows left (M % 4)?
+	beq	dgemm_tcopy_L999
+
+	tst	M, #2						// a pair of rows left?
+	beq	dgemm_tcopy_L1_BEGIN
+
+dgemm_tcopy_L2_M4_BEGIN:
+
+	ldr	AO1, A						// AO1 = A
+	lsl	r3, LDA, #1					// r3 = 2 * LDA
+	add	r3, r3 , AO1					// A = A + 2 * LDA
+	str	r3, A						// store A
+
+	ldr	BO1, B
+	add	r3, BO1, #64					// B = B + 8 * SIZE
+	str	r3, B
+
+	asr	I, N, #2					// I = N / 4
+	cmp	I, #0
+	ble	dgemm_tcopy_L2_M4_40
+
+dgemm_tcopy_L2_M4_20:
+
+	COPY4x2
+
+	subs	I , I , #1
+	bne	dgemm_tcopy_L2_M4_20
+
+
+dgemm_tcopy_L2_M4_40:
+
+	tst	N , #2						// leftover pair of columns?
+	beq	dgemm_tcopy_L2_M4_60
+
+	COPY2x2
+
+dgemm_tcopy_L2_M4_60:
+
+	tst	N , #1						// leftover single column?
+	beq	dgemm_tcopy_L2_M4_END
+
+	COPY1x2
+
+
+dgemm_tcopy_L2_M4_END:
+
+
+/*********************************************************************************************/
+
+dgemm_tcopy_L1_BEGIN:
+
+	tst	M, #1						// odd row left?
+	beq	dgemm_tcopy_L999
+
+
+dgemm_tcopy_L1_M4_BEGIN:
+
+	ldr	AO1, A						// AO1 = A
+	add	r3, LDA , AO1					// A = A + 1 * LDA
+	str	r3, A						// store A
+
+	ldr	BO1, B
+	add	r3, BO1, #32					// B = B + 4 * SIZE
+	str	r3, B
+
+	asr	I, N, #2					// I = N / 4
+	cmp	I, #0
+	ble	dgemm_tcopy_L1_M4_40
+
+dgemm_tcopy_L1_M4_20:
+
+	COPY4x1
+
+	subs	I , I , #1
+	bne	dgemm_tcopy_L1_M4_20
+
+
+dgemm_tcopy_L1_M4_40:
+
+	tst	N , #2						// leftover pair of columns?
+	beq	dgemm_tcopy_L1_M4_60
+
+	COPY2x1
+
+dgemm_tcopy_L1_M4_60:
+
+	tst	N , #1						// leftover single column?
+	beq	dgemm_tcopy_L1_M4_END
+
+	COPY1x1
+
+
+dgemm_tcopy_L1_M4_END:
+
+
+
+dgemm_tcopy_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	mov	r0, #0						// set return value
+	sub	sp, fp, #24					// drop the local frame
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/dot.c
+++ b/kernel/arm/dot.c
@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+/*
+ * Strided dot product: returns the sum over i in [0, n) of
+ * x[i*inc_x] * y[i*inc_y]. Returns 0 when n <= 0.
+ *
+ * The running sum is always kept in double. When DSDOT is defined the
+ * function returns double and every product is also formed in double:
+ * the operands are widened before the multiply, otherwise a float*float
+ * product would be rounded to float first and the extra precision lost.
+ */
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	double dot = 0.0 ;
+
+	if ( n < 0 )  return(dot);
+
+	while(i < n)
+	{
+
+#if defined(DSDOT)
+		/* widen before multiplying so the product itself is double */
+		dot += (double) y[iy] * (double) x[ix] ;
+#else
+		dot += y[iy] * x[ix] ;
+#endif
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	return(dot);
+
+}
+	
+
--- a/kernel/arm/dtrmm_kernel_4x2_vfp.S
+++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S
--- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
--- a/kernel/arm/gemv_n.c
+++ b/kernel/arm/gemv_n.c
@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+
+/*
+ * Reference GEMV (non-transposed): y += alpha * A * x.
+ *
+ *   m, n        : A is walked as n columns of m elements each
+ *   dummy1      : unused
+ *   alpha       : scalar applied to every x element
+ *   a, lda      : matrix data; consecutive columns are lda elements apart
+ *   x, inc_x    : input vector and its stride (n elements are read)
+ *   y, inc_y    : vector updated in place and its stride (m elements)
+ *   buffer      : unused
+ *
+ * Always returns 0. Nothing is done when m <= 0 or n <= 0 because the
+ * loops simply do not execute.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT temp;
+
+	ix = 0;
+	a_ptr = a;
+
+	for (j=0; j<n; j++)
+	{
+		/* scale once per column, then add the column into y */
+		temp = alpha * x[ix];
+		iy = 0;
+		for (i=0; i<m; i++)
+		{
+			y[iy] += temp * a_ptr[i];
+			iy += inc_y;
+		}
+		a_ptr += lda;
+		ix    += inc_x;
+	}
+
+	/* the function is declared int: give callers a defined value */
+	return(0);
+
+}
+	
+
--- a/kernel/arm/gemv_n_vfp.S
+++ b/kernel/arm/gemv_n_vfp.S
@ -0,0 +1,740 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/28 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256	/* bytes reserved below the pushed registers */
+/* Incoming values: stack arguments are addressed from fp (set to the entry sp in the prologue); a and m arrive in r3 and r0. */
+#define	OLD_LDA		[fp, #0 ]
+#define	X		[fp, #4 ]
+#define	OLD_INC_X	[fp, #8 ]
+#define	Y		[fp, #12 ]
+#define	OLD_INC_Y	[fp, #16 ]
+#define OLD_A		r3
+#define	OLD_M		r0
+/* Working registers. AO1 reuses r0, so m is spilled to the M slot before AO1 is loaded. */
+#define AO1	r0
+#define N	r1
+#define J	r2
+/* AO2 is only ever used as a pld address; XO and YO walk x and y; LDA, INC_X, INC_Y hold the strides. */
+#define AO2	r4
+#define XO	r5
+#define YO	r6
+#define LDA	r7
+#define INC_X	r8
+#define INC_Y	r9
+/* Outer loop counter. */
+#define I	r12
+/* Spill slots for m and for the running A pointer, inside the STACKSIZE frame. */
+#define M	[fp, #-252 ]
+#define A	[fp, #-256 ]
+
+/* Byte offsets added to the pld addresses in the kernel macros. */
+#define X_PRE	64
+#define Y_PRE	0
+#define A_PRE	0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+
+#if	defined(DOUBLE)
+
+.macro INIT_F8
+	// Prefetch the y block that SAVE_F8 will update and clear the eight
+	// accumulators d8-d15. Clobbers r3 (dead scratch at the call site).
+	pld     [ YO , #Y_PRE ]
+	pld     [ YO , #Y_PRE+32 ]
+
+	// Build +0.0 from an integer zero. The former vsub d8, d8, d8 produced
+	// NaN whenever d8 still held NaN or Inf (stale caller data or the sum
+	// of a previous block), which then leaked into every following result.
+	mov		r3 , #0
+	vmov		d8 , r3 , r3
+	vmov.f64	d9  , d8
+	vmov.f64	d10 , d8
+	vmov.f64	d11 , d8
+	vmov.f64	d12 , d8
+	vmov.f64	d13 , d8
+	vmov.f64	d14 , d8
+	vmov.f64	d15 , d8
+
+.endm
+
+.macro KERNEL_F8X8
+	// Eight KERNEL_F8X1 steps (eight x elements); x is prefetched at XO+X_PRE before each group of four.
+	pld     [ XO , #X_PRE ]
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+	pld     [ XO , #X_PRE ]
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+.endm
+
+
+.macro KERNEL_F8X1
+	// d8..d15 += x * A[0..7] for one x element; XO is post-incremented, AO1 and AO2 advance by LDA. Clobbers d2, d4-d7, r3.
+	pld	[ AO2 , #A_PRE ]
+	fldmiad	XO! ,  { d2 }				// d2 = next x element, XO moves to the following one
+	fldmiad	AO1 ,  { d4 - d7 }			// A elements 0..3
+
+	vmla.f64	d8  , d2 , d4
+	pld	[ AO2 , #4*SIZE ]
+	vmla.f64	d9  , d2 , d5
+	add	r3, AO1, #4*SIZE			// r3 -> A elements 4..7
+	vmla.f64	d10 , d2 , d6
+	vmla.f64	d11 , d2 , d7
+
+
+	fldmiad	r3 ,  { d4 - d7 }			// A elements 4..7 reuse d4-d7
+
+	vmla.f64	d12 , d2 , d4
+	vmla.f64	d13 , d2 , d5
+	add		AO1, AO1, LDA
+	vmla.f64	d14 , d2 , d6
+	add		AO2, AO2, LDA
+	vmla.f64	d15 , d2 , d7
+
+
+.endm
+
+.macro	SAVE_F8
+	// y[0..7] += d0 * (d8..d15), four elements at a time; YO ends 8 elements further. d0 is never written in this file (presumably alpha from the caller, TODO confirm).
+	fldmiad	YO,  { d4 - d7 }
+
+	vmla.f64	d4 , d0, d8
+	vmla.f64	d5 , d0, d9
+	vmla.f64	d6 , d0, d10
+	vmla.f64	d7 , d0, d11
+
+	fstmiad	YO!, { d4 - d7 }			// store and step YO past these four
+
+	fldmiad	YO,  { d4 - d7 }
+
+	vmla.f64	d4 , d0, d12
+	vmla.f64	d5 , d0, d13
+	vmla.f64	d6 , d0, d14
+	vmla.f64	d7 , d0, d15
+
+	fstmiad	YO!, { d4 - d7 }
+
+.endm
+
+
+.macro INIT_F1
+	// Clear the single accumulator d12 used by KERNEL_F1X1 and SAVE_F1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub d12, d12, d12 gives NaN when d12 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		d12 , r3 , r3
+
+.endm
+
+
+
+.macro KERNEL_F1X1
+	// d12 += x * A[0] for one contiguous x element; XO is post-incremented, AO1 advances by LDA. Clobbers d2, d8.
+	fldmiad	XO! ,  { d2 }
+	fldmiad	AO1 ,  { d8 }
+	vmla.f64	d12 , d2 , d8
+	add		AO1, AO1, LDA
+
+.endm
+
+.macro	SAVE_F1
+	// y[0] += d0 * d12, then YO steps to the next contiguous element. Clobbers d4.
+	fldmiad	YO,  { d4 }
+	vmla.f64	d4, d0, d12
+	fstmiad	YO!, { d4 }
+
+.endm
+
+/*********************************************************************************************/
+
+.macro INIT_S4
+	// Clear the four accumulators d12-d15 of the strided path.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub d12, d12, d12 gives NaN when d12 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		d12 , r3 , r3
+	vmov.f64	d13 , d12
+	vmov.f64	d14 , d12
+	vmov.f64	d15 , d12
+
+.endm
+
+.macro KERNEL_S4X4
+	// Four KERNEL_S4X1 steps, i.e. four strided x elements.
+	KERNEL_S4X1
+	KERNEL_S4X1
+	KERNEL_S4X1
+	KERNEL_S4X1
+
+.endm
+
+
+.macro KERNEL_S4X1
+	// d12..d15 += x * A[0..3] for one x element; AO1 and AO2 advance by LDA, XO by INC_X (already in bytes). Clobbers d2, d8-d11.
+	pld	[ AO2 , #A_PRE ]
+	fldmiad	XO  ,  { d2 }
+	fldmiad	AO1 ,  { d8 - d11 }
+
+	vmla.f64	d12 , d2 , d8
+	add		AO1, AO1, LDA
+	vmla.f64	d13 , d2 , d9
+	add		AO2, AO2, LDA
+	vmla.f64	d14 , d2 , d10
+	vmla.f64	d15 , d2 , d11
+	add		XO, XO , INC_X
+
+.endm
+
+.macro	SAVE_S4
+	// Four strided updates y[k] += d0 * d(12+k); YO advances by INC_X-independent byte stride INC_Y after each. Clobbers d4, d5.
+	fldmiad	YO,  { d4 }
+	vmla.f64	d4 , d0, d12
+	fstmiad	YO,  { d4 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d5 }
+	vmla.f64	d5 , d0, d13
+	fstmiad	YO,  { d5 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d4 }
+	vmla.f64	d4 , d0, d14
+	fstmiad	YO,  { d4 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d5 }
+	vmla.f64	d5 , d0, d15
+	fstmiad	YO,  { d5 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+.macro INIT_S1
+	// Clear the single accumulator d12 used by KERNEL_S1X1 and SAVE_S1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub d12, d12, d12 gives NaN when d12 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		d12 , r3 , r3
+
+.endm
+
+
+
+.macro KERNEL_S1X1
+	// d12 += x * A[0] for one strided x element; AO1 advances by LDA, XO by INC_X bytes. Clobbers d2, d8.
+	fldmiad	XO  ,  { d2 }
+	fldmiad	AO1 ,  { d8 }
+	vmla.f64	d12 , d2 , d8
+	add		AO1, AO1, LDA
+	add		XO, XO , INC_X
+
+.endm
+
+.macro	SAVE_S1
+	// y[0] += d0 * d12, then YO advances by INC_Y bytes. Clobbers d4.
+	fldmiad	YO,  { d4 }
+	vmla.f64	d4, d0, d12
+	fstmiad	YO , { d4 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+
+
+#else	/************************* SINGLE PRECISION *****************************************/
+
+.macro INIT_F8
+	// Prefetch the y block that SAVE_F8 will update and clear the eight
+	// accumulators s8-s15. Clobbers r3 (dead scratch at the call site).
+	pld     [ YO , #Y_PRE ]
+
+	// Build +0.0 from an integer zero. The former vsub s8, s8, s8 produced
+	// NaN whenever s8 still held NaN or Inf (stale caller data or the sum
+	// of a previous block), which then leaked into every following result.
+	mov		r3 , #0
+	vmov		s8 , r3
+	vmov.f32	s9  , s8
+	vmov.f32	s10 , s8
+	vmov.f32	s11 , s8
+	vmov.f32	s12 , s8
+	vmov.f32	s13 , s8
+	vmov.f32	s14 , s8
+	vmov.f32	s15 , s8
+
+.endm
+
+.macro KERNEL_F8X8
+	// Eight KERNEL_F8X1 steps (eight x elements) after a single prefetch of x at XO+X_PRE.
+	pld     [ XO , #X_PRE ]
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+.endm
+
+
+.macro KERNEL_F8X1
+	// s8..s15 += x * A[0..7] for one x element; XO is post-incremented, AO1 and AO2 advance by LDA. Clobbers s2, s4-s7, r3.
+	pld	[ AO2, #A_PRE ]
+	fldmias	XO! ,  { s2 }				// s2 = next x element, XO moves to the following one
+	fldmias	AO1 ,  { s4 - s7 }			// A elements 0..3
+
+	vmla.f32	s8  , s2 , s4
+	vmla.f32	s9  , s2 , s5
+	vmla.f32	s10 , s2 , s6
+	vmla.f32	s11 , s2 , s7
+
+	add	r3, AO1, #4*SIZE			// r3 -> A elements 4..7
+
+	fldmias	r3 ,  { s4 - s7 }			// A elements 4..7 reuse s4-s7
+
+	vmla.f32	s12 , s2 , s4
+	vmla.f32	s13 , s2 , s5
+	vmla.f32	s14 , s2 , s6
+	vmla.f32	s15 , s2 , s7
+
+	add		AO1, AO1, LDA
+	add		AO2, AO2, LDA
+
+.endm
+
+.macro	SAVE_F8
+	// y[0..7] += s0 * (s8..s15), four elements at a time; YO ends 8 elements further. s0 is never written in this file (presumably alpha from the caller, TODO confirm).
+	fldmias	YO,  { s4 - s7 }
+
+	vmla.f32	s4 , s0, s8
+	vmla.f32	s5 , s0, s9
+	vmla.f32	s6 , s0, s10
+	vmla.f32	s7 , s0, s11
+
+	fstmias	YO!, { s4 - s7 }			// store and step YO past these four
+
+
+	fldmias	YO,  { s4 - s7 }
+
+	vmla.f32	s4 , s0, s12
+	vmla.f32	s5 , s0, s13
+	vmla.f32	s6 , s0, s14
+	vmla.f32	s7 , s0, s15
+
+	fstmias	YO!, { s4 - s7 }
+
+.endm
+
+
+.macro INIT_F1
+	// Clear the single accumulator s12 used by KERNEL_F1X1 and SAVE_F1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub s12, s12, s12 gives NaN when s12 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		s12 , r3
+
+.endm
+
+
+
+.macro KERNEL_F1X1
+	// s12 += x * A[0] for one contiguous x element; XO is post-incremented, AO1 advances by LDA. Clobbers s2, s8.
+	fldmias	XO! ,  { s2 }
+	fldmias	AO1 ,  { s8 }
+	vmla.f32	s12 , s2 , s8
+	add		AO1, AO1, LDA
+
+.endm
+
+.macro	SAVE_F1
+	// y[0] += s0 * s12, then YO steps to the next contiguous element. Clobbers s4.
+	fldmias	YO,  { s4 }
+	vmla.f32	s4, s0, s12
+	fstmias	YO!, { s4 }
+
+.endm
+
+/*********************************************************************************************/
+
+.macro INIT_S4
+	// Clear the four accumulators s12-s15 of the strided path.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub s12, s12, s12 gives NaN when s12 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		s12 , r3
+	vmov.f32	s13 , s12
+	vmov.f32	s14 , s12
+	vmov.f32	s15 , s12
+
+.endm
+
+.macro KERNEL_S4X4
+	// Four KERNEL_S4X1 steps (four strided x elements), with a pld on AO2 before each pair.
+	pld	[ AO2 , #A_PRE ]
+	KERNEL_S4X1
+	KERNEL_S4X1
+	pld	[ AO2 , #A_PRE ]
+	KERNEL_S4X1
+	KERNEL_S4X1
+
+.endm
+
+
+.macro KERNEL_S4X1
+	// s12..s15 += x * A[0..3] for one x element; AO1 and AO2 advance by LDA, XO by INC_X (already in bytes). Clobbers s2, s8-s11.
+	fldmias	XO  ,  { s2 }
+	fldmias	AO1 ,  { s8 - s11 }
+
+	vmla.f32	s12 , s2 , s8
+	vmla.f32	s13 , s2 , s9
+	vmla.f32	s14 , s2 , s10
+	vmla.f32	s15 , s2 , s11
+	add		AO1, AO1, LDA
+	add		AO2, AO2, LDA
+	add		XO, XO , INC_X
+
+.endm
+
+.macro	SAVE_S4
+	// Four strided updates y[k] += s0 * s(12+k); YO advances by the byte stride INC_Y after each. Clobbers s4, s5.
+	fldmias	YO,  { s4 }
+	vmla.f32	s4 , s0, s12
+	fstmias	YO,  { s4 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s5 }
+	vmla.f32	s5 , s0, s13
+	fstmias	YO,  { s5 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s4 }
+	vmla.f32	s4 , s0, s14
+	fstmias	YO,  { s4 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s5 }
+	vmla.f32	s5 , s0, s15
+	fstmias	YO,  { s5 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+.macro INIT_S1
+	// Clear the single accumulator s12 used by KERNEL_S1X1 and SAVE_S1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub s12, s12, s12 gives NaN when s12 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		s12 , r3
+
+.endm
+
+
+
+.macro KERNEL_S1X1
+	// s12 += x * A[0] for one strided x element; AO1 advances by LDA, XO by INC_X bytes. Clobbers s2, s8.
+	fldmias	XO  ,  { s2 }
+	fldmias	AO1 ,  { s8 }
+	vmla.f32	s12 , s2 , s8
+	add		AO1, AO1, LDA
+	add		XO, XO , INC_X
+
+.endm
+
+.macro	SAVE_S1
+	// y[0] += s0 * s12, then YO advances by INC_Y bytes. Clobbers s4.
+	fldmias	YO,  { s4 }
+	vmla.f32	s4, s0, s12
+	fstmias	YO , { s4 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+	// GEMV (non-transposed) kernel: y += d0/s0 * A * x. Inputs are named by the OLD_* / X / Y defines: r0 = m, r1 = n, r3 = a, the rest on the stack. Returns 0 in r0.
+	.align 5
+	push    {r4 - r9 , fp}					// 7 words = 28 bytes
+        add     fp, sp, #28					// fp = sp at entry, so stack arguments sit at [fp, #0] and up
+	sub     sp, sp, #STACKSIZE                              // reserve stack
+
+        sub     r12, fp, #192					// FP register save area starts at fp-192
+
+#if	defined(DOUBLE)
+        vstm    r12, { d8 - d15 }                                 // store floating point registers
+#else
+        vstm    r12, { s8 - s15 }                                 // store floating point registers
+#endif
+	// Nothing to do when m <= 0 or n <= 0.
+	cmp	OLD_M, #0
+	ble	gemvn_kernel_L999
+
+	cmp	N, #0
+	ble	gemvn_kernel_L999
+	// Spill a and m: r3 becomes scratch and r0 is reused as AO1.
+	str	OLD_A, A
+	str	OLD_M, M
+
+	ldr    INC_X , OLD_INC_X
+	ldr    INC_Y , OLD_INC_Y
+	// A zero increment on either vector is treated as nothing to do.
+	cmp	INC_X, #0
+	beq	gemvn_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	gemvn_kernel_L999
+
+	ldr	LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #3				// LDA * SIZE
+#else
+	lsl	LDA, LDA, #2				// LDA * SIZE
+#endif
+	// Unit increments on both x and y take the contiguous F path; anything else uses the strided S path.
+	cmp	INC_X, #1
+	bne	gemvn_kernel_S4_BEGIN
+
+	cmp	INC_Y, #1
+	bne	gemvn_kernel_S4_BEGIN
+
+
+gemvn_kernel_F4_BEGIN:
+	// Contiguous path. The labels say F4 but the macros used are the 8-wide F8 ones: 8 y elements per outer iteration.
+	ldr	YO , Y
+
+	ldr	I, M
+	asrs	I, I, #3					// I = M / 8
+	ble	gemvn_kernel_F1_BEGIN
+
+gemvn_kernel_F4X4:
+
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO1, #8*SIZE				// the next block of 8 starts here
+	str	r3 , A
+	// AO2 ends up at AO1 + 3*LDA; the macros only use it as a pld address.
+	add	AO2, AO2, LDA
+	add	AO2, AO2, LDA
+
+	ldr	XO , X
+
+	INIT_F8
+
+	asrs	J, N, #3					// J = N / 8
+	ble	gemvn_kernel_F4X1
+
+
+gemvn_kernel_F4X4_10:
+
+	KERNEL_F8X8
+
+	subs	J, J, #1
+	bne	gemvn_kernel_F4X4_10
+
+
+gemvn_kernel_F4X1:
+	// Remaining N mod 8 x elements. NOTE(review): ands/asrs do not update V, so these ble tests rely on V being clear from the previous cmp/subs; beq would state the zero test directly - confirm.
+	ands	J, N , #7
+	ble	gemvn_kernel_F4_END
+
+gemvn_kernel_F4X1_10:
+
+	KERNEL_F8X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_F4X1_10
+
+
+gemvn_kernel_F4_END:
+
+	SAVE_F8
+
+	subs	I , I , #1
+	bne	gemvn_kernel_F4X4
+
+
+gemvn_kernel_F1_BEGIN:
+	// Remaining M mod 8 y elements, one per outer iteration.
+	ldr	I, M
+	ands	I,  I , #7
+	ble	gemvn_kernel_L999
+
+gemvn_kernel_F1X1:
+
+	ldr	AO1, A
+	add	r3, AO1, #SIZE
+	str	r3, A
+	
+	ldr	XO , X
+
+	INIT_F1
+
+	mov	J, N						// N > 0 was checked at entry, so the loop below runs at least once
+
+
+gemvn_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_F1X1_10
+
+
+gemvn_kernel_F1_END:
+
+	SAVE_F1
+
+	subs	I , I , #1
+	bne	gemvn_kernel_F1X1
+
+	b	gemvn_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+gemvn_kernel_S4_BEGIN:
+	// Strided path: increments are converted to byte strides, then 4 y elements are produced per outer iteration.
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+#endif
+
+	ldr	YO , Y
+
+	ldr	I, M
+	asrs	I, I, #2					// I = M / 4
+	ble	gemvn_kernel_S1_BEGIN
+
+gemvn_kernel_S4X4:
+
+	ldr	AO1, A
+	add	AO2, AO1, LDA					// pld address, one LDA step ahead of AO1
+	add	r3 , AO1, #4*SIZE
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S4
+
+	asrs	J, N, #2					// J = N / 4
+	ble	gemvn_kernel_S4X1
+
+
+gemvn_kernel_S4X4_10:
+
+	KERNEL_S4X4
+
+	subs	J, J, #1
+	bne	gemvn_kernel_S4X4_10
+
+
+gemvn_kernel_S4X1:
+	// Remaining N mod 4 x elements.
+	ands	J, N , #3
+	ble	gemvn_kernel_S4_END
+
+gemvn_kernel_S4X1_10:
+
+	KERNEL_S4X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_S4X1_10
+
+
+gemvn_kernel_S4_END:
+
+	SAVE_S4
+
+	subs	I , I , #1
+	bne	gemvn_kernel_S4X4
+
+
+gemvn_kernel_S1_BEGIN:
+	// Remaining M mod 4 y elements, one per outer iteration.
+	ldr	I, M
+	ands	I,  I , #3
+	ble	gemvn_kernel_L999
+
+gemvn_kernel_S1X1:
+
+	ldr	AO1, A
+	add	r3, AO1, #SIZE
+	str	r3, A
+	
+	ldr	XO , X
+
+	INIT_S1
+
+	mov	J, N
+
+
+gemvn_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_S1X1_10
+
+
+gemvn_kernel_S1_END:
+
+	SAVE_S1
+
+	subs	I , I , #1
+	bne	gemvn_kernel_S1X1
+
+
+/*************************************************************************************************************/
+
+gemvn_kernel_L999:
+	// Common exit: reload the FP registers saved in the prologue, return 0 and unwind the frame.
+        sub     r3, fp, #192
+
+#if	defined(DOUBLE)
+        vldm    r3, { d8 - d15 }                                 // restore floating point registers
+#else
+        vldm    r3, { s8 - s15 }                                 // restore floating point registers
+#endif
+
+	mov	r0, #0		// set return value
+
+	sub     sp, fp, #28					// back to the pushed r4-r9, fp
+	pop     {r4 -r9 ,fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/gemv_n_vfpv3.S
+++ b/kernel/arm/gemv_n_vfpv3.S
@ -0,0 +1,781 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/19 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256	/* bytes reserved below the pushed registers */
+/* Incoming values: stack arguments are addressed from fp (set to the entry sp in the prologue); a and m arrive in r3 and r0. */
+#define	OLD_LDA		[fp, #0 ]
+#define	X		[fp, #4 ]
+#define	OLD_INC_X	[fp, #8 ]
+#define	Y		[fp, #12 ]
+#define	OLD_INC_Y	[fp, #16 ]
+#define OLD_A		r3
+#define	OLD_M		r0
+/* Working registers. AO1 reuses r0, so m is spilled to the M slot before AO1 is loaded. */
+#define AO1	r0
+#define N	r1
+#define J	r2
+/* AO2 is only ever used as a pld address; XO and YO walk x and y; LDA, INC_X, INC_Y hold the strides. */
+#define AO2	r4
+#define XO	r5
+#define YO	r6
+#define LDA	r7
+#define INC_X	r8
+#define INC_Y	r9
+/* Outer loop counter. */
+#define I	r12
+/* Spill slots for m and for the running A pointer, inside the STACKSIZE frame. */
+#define M	[fp, #-252 ]
+#define A	[fp, #-256 ]
+
+/* Byte offsets added to the pld addresses in the kernel macros. */
+#define X_PRE	64
+#define Y_PRE	0
+#define A_PRE	0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+
+#if	defined(DOUBLE)
+
+.macro INIT_F8
+	// Prefetch the y block that SAVE_F8 will update and clear the eight
+	// accumulators d24-d31. Clobbers r3 (dead scratch at the call site).
+	pld     [ YO , #Y_PRE ]
+	pld     [ YO , #Y_PRE+32 ]
+
+	// Build +0.0 from an integer zero. The former vsub d24, d24, d24 produced
+	// NaN whenever d24 still held NaN or Inf (stale caller data or the sum
+	// of a previous block), which then leaked into every following result.
+	mov		r3 , #0
+	vmov		d24 , r3 , r3
+	vmov.f64	d25 , d24
+	vmov.f64	d26 , d24
+	vmov.f64	d27 , d24
+	vmov.f64	d28 , d24
+	vmov.f64	d29 , d24
+	vmov.f64	d30 , d24
+	vmov.f64	d31 , d24
+
+.endm
+
+.macro KERNEL_F8X8
+	// Eight KERNEL_F8X1 steps (eight x elements); x is prefetched at XO+X_PRE before each group of four.
+	pld     [ XO , #X_PRE ]
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+	pld     [ XO , #X_PRE ]
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+.endm
+
+
+.macro KERNEL_F8X1
+	// d24..d31 += x * A[0..7] for one x element; XO is post-incremented, AO1 and AO2 advance by LDA. Clobbers d4, d8-d15.
+	fldmiad	XO! ,  { d4 }				// d4 = next x element, XO moves to the following one
+	fldmiad	AO1 ,  { d8 - d15 }			// A elements 0..7 in one load
+
+	vmla.f64	d24 , d4 , d8
+	pld	[ AO2 , #A_PRE ]
+	vmla.f64	d25 , d4 , d9
+	pld	[ AO2 , #A_PRE+32 ]
+	vmla.f64	d26 , d4 , d10
+	vmla.f64	d27 , d4 , d11
+	vmla.f64	d28 , d4 , d12
+	vmla.f64	d29 , d4 , d13
+	add		AO1, AO1, LDA
+	vmla.f64	d30 , d4 , d14
+	add		AO2, AO2, LDA
+	vmla.f64	d31 , d4 , d15
+
+.endm
+
+.macro	SAVE_F8
+	// y[0..7] += d0 * (d24..d31); YO ends 8 elements further. Clobbers d16-d23. d0 is never written in this file (presumably alpha from the caller, TODO confirm).
+	fldmiad	YO,  { d16 - d23 }
+
+	vmla.f64	d16, d0, d24
+	vmla.f64	d17, d0, d25
+	vmla.f64	d18, d0, d26
+	vmla.f64	d19, d0, d27
+	vmla.f64	d20, d0, d28
+	vmla.f64	d21, d0, d29
+	vmla.f64	d22, d0, d30
+	vmla.f64	d23, d0, d31
+
+	fstmiad	YO!, { d16 - d23 }
+
+.endm
+
+
+.macro INIT_F1
+	// Clear the single accumulator d24 used by KERNEL_F1X1 and SAVE_F1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub d24, d24, d24 gives NaN when d24 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		d24 , r3 , r3
+
+.endm
+
+
+
+.macro KERNEL_F1X1
+	// d24 += x * A[0] for one contiguous x element; XO is post-incremented, AO1 advances by LDA. Clobbers d4, d8.
+	fldmiad	XO! ,  { d4 }
+	fldmiad	AO1 ,  { d8 }
+	vmla.f64	d24 , d4 , d8
+	add		AO1, AO1, LDA
+
+.endm
+
+.macro	SAVE_F1
+	// y[0] += d0 * d24, then YO steps to the next contiguous element. Clobbers d16.
+	fldmiad	YO,  { d16 }
+	vmla.f64	d16, d0, d24
+	fstmiad	YO!, { d16 }
+
+.endm
+
+/*********************************************************************************************/
+
+
+.macro INIT_S8
+	// Clear the eight accumulators d24-d31 of the strided path.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub d24, d24, d24 gives NaN when d24 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		d24 , r3 , r3
+	vmov.f64	d25 , d24
+	vmov.f64	d26 , d24
+	vmov.f64	d27 , d24
+	vmov.f64	d28 , d24
+	vmov.f64	d29 , d24
+	vmov.f64	d30 , d24
+	vmov.f64	d31 , d24
+
+.endm
+
+.macro KERNEL_S8X8
+	// Eight KERNEL_S8X1 steps, i.e. eight strided x elements.
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+
+.endm
+
+
+.macro KERNEL_S8X1
+	// d24..d31 += x * A[0..7] for one x element; AO1 and AO2 advance by LDA, XO by INC_X (already in bytes). Clobbers d4, d8-d15.
+	pld	[ AO2 , #A_PRE ]
+	pld	[ AO2 , #A_PRE+32 ]
+	fldmiad	XO ,  { d4 }
+	fldmiad	AO1 ,  { d8 - d15 }
+
+	vmla.f64	d24 , d4 , d8
+	vmla.f64	d25 , d4 , d9
+	vmla.f64	d26 , d4 , d10
+	vmla.f64	d27 , d4 , d11
+	vmla.f64	d28 , d4 , d12
+	vmla.f64	d29 , d4 , d13
+	vmla.f64	d30 , d4 , d14
+	vmla.f64	d31 , d4 , d15
+	add		AO1, AO1, LDA
+	add		AO2, AO2, LDA
+	add		XO, XO, INC_X
+
+.endm
+
+.macro	SAVE_S8
+	// Eight strided updates y[k] += d0 * d(24+k); YO advances by the byte stride INC_Y after each. Clobbers d16-d23.
+	fldmiad	YO,  { d16 }
+	vmla.f64	d16, d0, d24
+	fstmiad	YO,  { d16 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d17 }
+	vmla.f64	d17, d0, d25
+	fstmiad	YO,  { d17 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d18 }
+	vmla.f64	d18, d0, d26
+	fstmiad	YO,  { d18 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d19 }
+	vmla.f64	d19, d0, d27
+	fstmiad	YO,  { d19 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d20 }
+	vmla.f64	d20, d0, d28
+	fstmiad	YO,  { d20 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d21 }
+	vmla.f64	d21, d0, d29
+	fstmiad	YO,  { d21 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d22 }
+	vmla.f64	d22, d0, d30
+	fstmiad	YO,  { d22 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d23 }
+	vmla.f64	d23, d0, d31
+	fstmiad	YO,  { d23 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+.macro INIT_S1
+	// Clear the single accumulator d24 used by KERNEL_S1X1 and SAVE_S1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub d24, d24, d24 gives NaN when d24 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		d24 , r3 , r3
+
+.endm
+
+
+
+.macro KERNEL_S1X1
+	// d24 += x * A[0] for one strided x element; AO1 advances by LDA, XO by INC_X bytes. Clobbers d4, d8.
+	fldmiad	XO  ,  { d4 }
+	fldmiad	AO1 ,  { d8 }
+	vmla.f64	d24 , d4 , d8
+	add		AO1, AO1, LDA
+	add	XO, XO, INC_X
+
+.endm
+
+.macro	SAVE_S1
+	// y[0] += d0 * d24, then YO advances by INC_Y bytes. Clobbers d16.
+	fldmiad	YO,  { d16 }
+	vmla.f64	d16, d0, d24
+	fstmiad	YO,  { d16 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+
+#else	/************************* SINGLE PRECISION *****************************************/
+
+.macro INIT_F8
+	// Prefetch the y block that SAVE_F8 will update and clear the eight
+	// accumulators s24-s31. Clobbers r3 (dead scratch at the call site).
+	pld     [ YO , #Y_PRE ]
+
+	// Build +0.0 from an integer zero. The former vsub s24, s24, s24 produced
+	// NaN whenever s24 still held NaN or Inf (stale caller data or the sum
+	// of a previous block), which then leaked into every following result.
+	mov		r3 , #0
+	vmov		s24 , r3
+	vmov.f32	s25 , s24
+	vmov.f32	s26 , s24
+	vmov.f32	s27 , s24
+	vmov.f32	s28 , s24
+	vmov.f32	s29 , s24
+	vmov.f32	s30 , s24
+	vmov.f32	s31 , s24
+
+.endm
+
+.macro KERNEL_F8X8
+	// Eight KERNEL_F8X1 steps (eight x elements) after a single prefetch of x at XO+X_PRE.
+	pld     [ XO , #X_PRE ]
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+	KERNEL_F8X1
+
+.endm
+
+
+.macro KERNEL_F8X1
+	// s24..s31 += x * A[0..7] for one x element; XO is post-incremented, AO1 and AO2 advance by LDA. Clobbers s4, s8-s15.
+	pld	[ AO2 , #A_PRE ]
+	fldmias	XO! ,  { s4 }				// s4 = next x element, XO moves to the following one
+	fldmias	AO1 ,  { s8 - s15 }			// A elements 0..7 in one load
+
+	vmla.f32	s24 , s4 , s8
+	vmla.f32	s25 , s4 , s9
+	vmla.f32	s26 , s4 , s10
+	vmla.f32	s27 , s4 , s11
+	vmla.f32	s28 , s4 , s12
+	vmla.f32	s29 , s4 , s13
+	vmla.f32	s30 , s4 , s14
+	vmla.f32	s31 , s4 , s15
+	add		AO1, AO1, LDA
+	add		AO2, AO2, LDA
+
+.endm
+
+.macro	SAVE_F8
+	// y[0..7] += s0 * (s24..s31); YO ends 8 elements further. Clobbers s16-s23. s0 is never written in this file (presumably alpha from the caller, TODO confirm).
+	fldmias	YO,  { s16 - s23 }
+
+	vmla.f32	s16, s0, s24
+	vmla.f32	s17, s0, s25
+	vmla.f32	s18, s0, s26
+	vmla.f32	s19, s0, s27
+	vmla.f32	s20, s0, s28
+	vmla.f32	s21, s0, s29
+	vmla.f32	s22, s0, s30
+	vmla.f32	s23, s0, s31
+
+	fstmias	YO!, { s16 - s23 }
+
+.endm
+
+
+.macro INIT_F1
+	// Clear the single accumulator s24 used by KERNEL_F1X1 and SAVE_F1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub s24, s24, s24 gives NaN when s24 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		s24 , r3
+
+.endm
+
+
+
+.macro KERNEL_F1X1
+	// s24 += x * A[0] for one contiguous x element; XO is post-incremented, AO1 advances by LDA. Clobbers s4, s8.
+	fldmias	XO! ,  { s4 }
+	fldmias	AO1 ,  { s8 }
+	vmla.f32	s24 , s4 , s8
+	add		AO1, AO1, LDA
+
+.endm
+
+.macro	SAVE_F1
+	// y[0] += s0 * s24, then YO steps to the next contiguous element. Clobbers s16.
+	fldmias	YO,  { s16 }
+	vmla.f32	s16, s0, s24
+	fstmias	YO!, { s16 }
+
+.endm
+
+/*********************************************************************************************/
+
+
+.macro INIT_S8
+	// Clear the eight accumulators s24-s31 of the strided path.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub s24, s24, s24 gives NaN when s24 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		s24 , r3
+	vmov.f32	s25 , s24
+	vmov.f32	s26 , s24
+	vmov.f32	s27 , s24
+	vmov.f32	s28 , s24
+	vmov.f32	s29 , s24
+	vmov.f32	s30 , s24
+	vmov.f32	s31 , s24
+
+.endm
+
+.macro KERNEL_S8X8
+	// Eight KERNEL_S8X1 steps, i.e. eight strided x elements.
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+	KERNEL_S8X1
+
+.endm
+
+
+.macro KERNEL_S8X1
+	// s24..s31 += x * A[0..7] for one x element; AO1 and AO2 advance by LDA, XO by INC_X (already in bytes). Clobbers s4, s8-s15.
+	pld	[ AO2 , #A_PRE ]
+	fldmias	XO ,  { s4 }
+	fldmias	AO1 ,  { s8 - s15 }
+
+	vmla.f32	s24 , s4 , s8
+	vmla.f32	s25 , s4 , s9
+	vmla.f32	s26 , s4 , s10
+	vmla.f32	s27 , s4 , s11
+	vmla.f32	s28 , s4 , s12
+	vmla.f32	s29 , s4 , s13
+	vmla.f32	s30 , s4 , s14
+	vmla.f32	s31 , s4 , s15
+	add		AO1, AO1, LDA
+	add		AO2, AO2, LDA
+	add		XO, XO, INC_X
+
+.endm
+
+.macro	SAVE_S8
+	// Eight strided updates y[k] += s0 * s(24+k); YO advances by the byte stride INC_Y after each. Clobbers s16-s23.
+	fldmias	YO,  { s16 }
+	vmla.f32	s16, s0, s24
+	fstmias	YO,  { s16 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s17 }
+	vmla.f32	s17, s0, s25
+	fstmias	YO,  { s17 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s18 }
+	vmla.f32	s18, s0, s26
+	fstmias	YO,  { s18 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s19 }
+	vmla.f32	s19, s0, s27
+	fstmias	YO,  { s19 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s20 }
+	vmla.f32	s20, s0, s28
+	fstmias	YO,  { s20 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s21 }
+	vmla.f32	s21, s0, s29
+	fstmias	YO,  { s21 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s22 }
+	vmla.f32	s22, s0, s30
+	fstmias	YO,  { s22 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s23 }
+	vmla.f32	s23, s0, s31
+	fstmias	YO,  { s23 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+.macro INIT_S1
+	// Clear the single accumulator s24 used by KERNEL_S1X1 and SAVE_S1.
+	// Clobbers r3 (dead scratch at the call site). An integer zero is moved
+	// in because vsub s24, s24, s24 gives NaN when s24 holds NaN or Inf.
+	mov		r3 , #0
+	vmov		s24 , r3
+
+.endm
+
+
+
+.macro KERNEL_S1X1
+	// s24 += x * A[0] for one strided x element; AO1 advances by LDA, XO by INC_X bytes. Clobbers s4, s8.
+	fldmias	XO  ,  { s4 }
+	fldmias	AO1 ,  { s8 }
+	vmla.f32	s24 , s4 , s8
+	add		AO1, AO1, LDA
+	add	XO, XO, INC_X
+
+.endm
+
+.macro	SAVE_S1
+	// y[0] += s0 * s24, then YO advances by INC_Y bytes. Clobbers s16.
+	fldmias	YO,  { s16 }
+	vmla.f32	s16, s0, s24
+	fstmias	YO,  { s16 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+	// GEMV (non-transposed) kernel: y += d0/s0 * A * x. Inputs are named by the OLD_* / X / Y defines: r0 = m, r1 = n, r3 = a, the rest on the stack. Returns 0 in r0.
+	.align 5
+	push    {r4 - r9 , fp}					// 7 words = 28 bytes
+        add     fp, sp, #28					// fp = sp at entry, so stack arguments sit at [fp, #0] and up
+	sub     sp, sp, #STACKSIZE                              // reserve stack
+
+        sub     r12, fp, #192					// FP register save area starts at fp-192
+
+#if	defined(DOUBLE)
+        vstm    r12, { d8 - d15 }                                 // store floating point registers
+#else
+        vstm    r12, { s8 - s31 }                                 // store floating point registers
+#endif
+	// Nothing to do when m <= 0 or n <= 0.
+	cmp	OLD_M, #0
+	ble	gemvn_kernel_L999
+
+	cmp	N, #0
+	ble	gemvn_kernel_L999
+	// Spill a and m: r3 becomes scratch and r0 is reused as AO1.
+	str	OLD_A, A
+	str	OLD_M, M
+
+	ldr    INC_X , OLD_INC_X
+	ldr    INC_Y , OLD_INC_Y
+	// A zero increment on either vector is treated as nothing to do.
+	cmp	INC_X, #0
+	beq	gemvn_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	gemvn_kernel_L999
+
+	ldr	LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #3				// LDA * SIZE
+#else
+	lsl	LDA, LDA, #2				// LDA * SIZE
+#endif
+	// Unit increments on both x and y take the contiguous F path; anything else uses the strided S path.
+	cmp	INC_X, #1
+	bne	gemvn_kernel_S8_BEGIN
+
+	cmp	INC_Y, #1
+	bne	gemvn_kernel_S8_BEGIN
+
+
+gemvn_kernel_F8_BEGIN:
+	// Contiguous path: 8 y elements per outer iteration.
+	ldr	YO , Y
+
+	ldr	I, M
+	asrs	I, I, #3					// I = M / 8
+	ble	gemvn_kernel_F1_BEGIN
+
+gemvn_kernel_F8X8:
+
+	ldr	AO1, A
+	add	AO2, AO1, LDA					// pld address, one LDA step ahead of AO1
+	add	r3 , AO1, #8*SIZE				// the next block of 8 starts here
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_F8
+
+	asrs	J, N, #3					// J = N / 8
+	ble	gemvn_kernel_F8X1
+
+
+gemvn_kernel_F8X8_10:
+
+	KERNEL_F8X8
+
+	subs	J, J, #1
+	bne	gemvn_kernel_F8X8_10
+
+
+gemvn_kernel_F8X1:
+	// Remaining N mod 8 x elements. NOTE(review): ands/asrs do not update V, so these ble tests rely on V being clear from the previous cmp/subs; beq would state the zero test directly - confirm.
+	ands	J, N , #7
+	ble	gemvn_kernel_F8_END
+
+gemvn_kernel_F8X1_10:
+
+	KERNEL_F8X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_F8X1_10
+
+
+gemvn_kernel_F8_END:
+
+	SAVE_F8
+
+	subs	I , I , #1
+	bne	gemvn_kernel_F8X8
+
+
+gemvn_kernel_F1_BEGIN:
+	// Remaining M mod 8 y elements, one per outer iteration.
+	ldr	I, M
+	ands	I,  I , #7
+	ble	gemvn_kernel_L999
+
+gemvn_kernel_F1X1:
+
+	ldr	AO1, A
+	add	r3, AO1, #SIZE
+	str	r3, A
+	
+	ldr	XO , X
+
+	INIT_F1
+
+	mov	J, N						// N > 0 was checked at entry, so the loop below runs at least once
+
+
+gemvn_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_F1X1_10
+
+
+gemvn_kernel_F1_END:
+
+	SAVE_F1
+
+	subs	I , I , #1
+	bne	gemvn_kernel_F1X1
+
+	b	gemvn_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+gemvn_kernel_S8_BEGIN:
+	// Strided path: increments are converted to byte strides, then 8 y elements are produced per outer iteration.
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+#endif
+
+	ldr	YO , Y
+
+	ldr	I, M
+	asrs	I, I, #3					// I = M / 8
+	ble	gemvn_kernel_S1_BEGIN
+
+gemvn_kernel_S8X8:
+
+	ldr	AO1, A
+	add	AO2, AO1, LDA					// pld address, one LDA step ahead of AO1
+	add	r3 , AO1, #8*SIZE
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S8
+
+	asrs	J, N, #3					// J = N / 8
+	ble	gemvn_kernel_S8X1
+
+
+gemvn_kernel_S8X8_10:
+
+	KERNEL_S8X8
+
+	subs	J, J, #1
+	bne	gemvn_kernel_S8X8_10
+
+
+gemvn_kernel_S8X1:
+	// Remaining N mod 8 x elements.
+	ands	J, N , #7
+	ble	gemvn_kernel_S8_END
+
+gemvn_kernel_S8X1_10:
+
+	KERNEL_S8X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_S8X1_10
+
+
+gemvn_kernel_S8_END:
+
+	SAVE_S8
+
+	subs	I , I , #1
+	bne	gemvn_kernel_S8X8
+
+
+gemvn_kernel_S1_BEGIN:
+	// Remaining M mod 8 y elements, one per outer iteration.
+	ldr	I, M
+	ands	I,  I , #7
+	ble	gemvn_kernel_L999
+
+gemvn_kernel_S1X1:
+
+	ldr	AO1, A
+	add	r3, AO1, #SIZE
+	str	r3, A
+	
+	ldr	XO , X
+
+	INIT_S1
+
+	mov	J, N
+
+
+gemvn_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	J, J, #1
+	bne	gemvn_kernel_S1X1_10
+
+
+gemvn_kernel_S1_END:
+
+	SAVE_S1
+
+	subs	I , I , #1
+	bne	gemvn_kernel_S1X1
+
+
+/*************************************************************************************************************/
+
+gemvn_kernel_L999:
+	// Common exit: reload the FP registers saved in the prologue, return 0 and unwind the frame.
+        sub     r3, fp, #192
+
+#if	defined(DOUBLE)
+        vldm    r3, { d8 - d15 }                                 // restore floating point registers
+#else
+        vldm    r3, { s8 - s31 }                                 // restore floating point registers
+#endif
+
+	mov	r0, #0		// set return value
+
+	sub     sp, fp, #28					// back to the pushed r4-r9, fp
+	pop     {r4 -r9 ,fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/gemv_t.c
+++ b/kernel/arm/gemv_t.c
@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+
+/*
+ * Portable C GEMV kernel, transposed case: for every column j of the column-major
+ * matrix a (leading dimension lda), y[j*inc_y] += alpha * sum_i a[j*lda + i] * x[i*inc_x].
+ *
+ * m       number of rows of a (length of x)
+ * n       number of columns of a (length of y)
+ * dummy1  unused
+ * alpha   scalar applied to each dot product
+ * buffer  unused
+ *
+ * Returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG ix,iy;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT temp;
+
+	iy = 0;
+	a_ptr = a;
+
+	for (j=0; j<n; j++)
+	{
+		/* dot product of column j with x */
+		temp = 0.0;
+		ix = 0;
+		for (i=0; i<m; i++)
+		{
+			temp += a_ptr[i] * x[ix];
+			ix    += inc_x;
+		}
+		y[iy] += alpha * temp;
+		iy += inc_y;
+		a_ptr += lda;
+	}
+
+	/* The function is declared int: return a defined value instead of falling off the end. */
+	return(0);
+}
+	
+
--- a/kernel/arm/gemv_t_vfp.S
+++ b/kernel/arm/gemv_t_vfp.S
@ -0,0 +1,750 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/25 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_LDA		[fp, #0 ]
+#define	X		[fp, #4 ]
+#define	OLD_INC_X	[fp, #8 ]
+#define	Y		[fp, #12 ]
+#define	OLD_INC_Y	[fp, #16 ]
+#define OLD_A		r3
+#define	OLD_N		r1
+
+#define M	r0
+#define AO1	r1
+#define J	r2
+
+#define AO2	r4
+#define XO	r5
+#define YO	r6
+#define LDA	r7
+#define INC_X	r8
+#define INC_Y	r9
+
+#define I	r12
+
+#define N	[fp, #-252 ]
+#define A	[fp, #-256 ]
+
+
+#define X_PRE	512
+#define A_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+
+#if	defined(DOUBLE)
+
+.macro INIT_F2
+
+	// Zero the two column accumulators d2 and d3.
+	// An integer zero is moved in instead of computing d2 - d2: x - x is NaN when the
+	// register still holds NaN or +/-Inf (stale caller data or a previous column's
+	// non-finite sum), which would corrupt this column pair.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d2 , r3 , r3
+	vmov	d3 , r3 , r3
+
+.endm
+
+.macro KERNEL_F2X4
+
+	// Unit-stride step over 4 rows of two columns (double precision).
+	// Loads x[0..3] into d12-d15, column AO1 into d8-d11 and column AO2 into d4-d7,
+	// post-incrementing all three pointers, then accumulates
+	//   d2 += x[k] * AO1[k]   and   d3 += x[k] * AO2[k]   for k = 0..3.
+	// pld prefetches X_PRE / A_PRE bytes ahead of the current pointers.
+
+	pld	[ XO  , #X_PRE ]
+	fldmiad	XO! ,  { d12 - d15 }
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	AO1!,  { d8 - d9   }
+	pld	[ AO2 , #A_PRE ]
+	fldmiad	AO2!,  { d4 - d5 }
+	fldmiad	AO1!,  { d10 - d11 }
+	fldmiad	AO2!,  { d6 - d7 }
+
+	vmla.f64	d2 , d12 , d8
+	vmla.f64	d3 , d12 , d4
+	vmla.f64	d2 , d13 , d9
+	vmla.f64	d3 , d13 , d5
+	vmla.f64	d2 , d14, d10
+	vmla.f64	d3 , d14, d6
+	vmla.f64	d2 , d15, d11
+	vmla.f64	d3 , d15, d7
+
+.endm
+
+.macro KERNEL_F2X1
+
+	// Unit-stride remainder step: one row of two columns.
+	// d2 += x * AO1[0], d3 += x * AO2[0]; all three pointers advance by one element.
+	fldmiad	XO! ,  { d1 }
+	fldmiad	AO1!,  { d8 }
+	fldmiad	AO2!,  { d4 }
+	vmla.f64	d2 , d1 , d8
+	vmla.f64	d3 , d1 , d4
+
+.endm
+
+.macro	SAVE_F2
+
+	// Unit-stride update of two consecutive y elements: y[0] += d0 * d2, y[1] += d0 * d3,
+	// then YO advances past both.  d0 is never written in this file; presumably it still
+	// holds the incoming alpha argument (TODO confirm against the calling convention).
+	fldmiad	YO,  { d4 - d5 }
+	vmla.f64	d4, d0, d2
+	vmla.f64	d5, d0, d3
+	fstmiad	YO!, { d4 - d5 }
+
+.endm
+
+.macro INIT_F1
+
+	// Zero the single column accumulator d2 with an integer zero.
+	// d2 - d2 would give NaN if d2 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d2 , r3 , r3
+
+.endm
+
+.macro KERNEL_F1X4
+
+	// Unit-stride step over 4 rows of one column (double precision):
+	// d2 += x[k] * AO1[k] for k = 0..3; XO and AO1 advance by four elements.
+	pld	[ XO  , #X_PRE ]
+	fldmiad	XO! ,  { d12 - d15 }
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	AO1!,  { d8 - d9   }
+	fldmiad	AO1!,  { d10 - d11 }
+	vmla.f64	d2 , d12 , d8
+	vmla.f64	d2 , d13 , d9
+	vmla.f64	d2 , d14, d10
+	vmla.f64	d2 , d15, d11
+
+.endm
+
+.macro KERNEL_F1X1
+
+	// Unit-stride remainder step: d2 += x[0] * AO1[0]; both pointers advance one element.
+	fldmiad	XO! ,  { d1 }
+	fldmiad	AO1!,  { d8 }
+	vmla.f64	d2 , d1 , d8
+
+.endm
+
+.macro	SAVE_F1
+
+	// Unit-stride update of one y element: y[0] += d0 * d2, then YO advances.
+	// d0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmiad	YO,  { d4 }
+	vmla.f64	d4, d0, d2
+	fstmiad	YO!, { d4 }
+
+.endm
+
+
+.macro INIT_S2
+
+	// Zero the two column accumulators d2 and d3 (strided path) with an integer zero.
+	// d2 - d2 would give NaN if the register still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d2 , r3 , r3
+	vmov	d3 , r3 , r3
+
+.endm
+
+.macro KERNEL_S2X4
+
+	// Strided-x step over 4 rows of two columns (double precision).
+	// x elements are fetched one at a time into d12-d15, stepping XO by INC_X, which the
+	// kernel body has already scaled to bytes on this path.  The columns are contiguous:
+	// AO1 -> d8-d11, AO2 -> d4-d7.  Then d2 += x[k] * AO1[k], d3 += x[k] * AO2[k].
+
+	fldmiad	XO ,  { d12 }
+	add	XO, XO, INC_X
+
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	AO1!,  { d8 - d9   }
+	pld	[ AO2 , #A_PRE ]
+	fldmiad	AO2!,  { d4 - d5 }
+
+	fldmiad	XO ,  { d13 }
+	add	XO, XO, INC_X
+	fldmiad	AO1!,  { d10 - d11 }
+	fldmiad	AO2!,  { d6 - d7 }
+
+	fldmiad	XO ,  { d14 }
+	add	XO, XO, INC_X
+
+	fldmiad	XO ,  { d15 }
+	add	XO, XO, INC_X
+
+	vmla.f64	d2 , d12 , d8
+	vmla.f64	d3 , d12 , d4
+	vmla.f64	d2 , d13 , d9
+	vmla.f64	d3 , d13 , d5
+	vmla.f64	d2 , d14, d10
+	vmla.f64	d3 , d14, d6
+	vmla.f64	d2 , d15, d11
+	vmla.f64	d3 , d15, d7
+
+.endm
+
+.macro KERNEL_S2X1
+
+	// Strided-x remainder step: one row of two columns.
+	// d2 += x * AO1[0], d3 += x * AO2[0]; XO steps by INC_X (bytes), AO1/AO2 by one element.
+	fldmiad	XO ,  { d1 }
+	fldmiad	AO1!,  { d8 }
+	fldmiad	AO2!,  { d4 }
+	vmla.f64	d2 , d1 , d8
+	add	XO, XO, INC_X
+	vmla.f64	d3 , d1 , d4
+
+.endm
+
+.macro	SAVE_S2
+
+	// Strided update of two y elements: y += d0 * d2, step YO by INC_Y (bytes),
+	// then y += d0 * d3 and step again.
+	// d0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmiad	YO,  { d4 }
+	vmla.f64	d4, d0, d2
+	fstmiad	YO, { d4  }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d5 }
+	vmla.f64	d5, d0, d3
+	fstmiad	YO, { d5  }
+	add	YO, YO, INC_Y
+
+.endm
+
+.macro INIT_S1
+
+	// Zero the single column accumulator d2 (strided path) with an integer zero.
+	// d2 - d2 would give NaN if d2 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d2 , r3 , r3
+
+.endm
+
+.macro KERNEL_S1X4
+
+	// Strided-x step over 4 rows of one column (double precision):
+	// x[k] is loaded into d12-d15 with XO stepping by INC_X (bytes), AO1 -> d8-d11,
+	// then d2 += x[k] * AO1[k] for k = 0..3.
+
+	fldmiad	XO ,  { d12 }
+	add	XO, XO, INC_X
+
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	AO1!,  { d8 - d9   }
+
+	fldmiad	XO ,  { d13 }
+	add	XO, XO, INC_X
+	fldmiad	AO1!,  { d10 - d11 }
+
+	fldmiad	XO ,  { d14 }
+	add	XO, XO, INC_X
+
+	fldmiad	XO ,  { d15 }
+	add	XO, XO, INC_X
+
+	vmla.f64	d2 , d12 , d8
+	vmla.f64	d2 , d13 , d9
+	vmla.f64	d2 , d14, d10
+	vmla.f64	d2 , d15, d11
+
+.endm
+
+.macro KERNEL_S1X1
+
+	// Strided-x remainder step: d2 += x * AO1[0]; XO steps by INC_X (bytes), AO1 by one element.
+	fldmiad	XO ,  { d1 }
+	fldmiad	AO1!,  { d8 }
+	vmla.f64	d2 , d1 , d8
+	add	XO, XO, INC_X
+
+.endm
+
+.macro	SAVE_S1
+
+	// Strided update of one y element: y += d0 * d2, then YO steps by INC_Y (bytes).
+	// d0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmiad	YO,  { d4 }
+	vmla.f64	d4, d0, d2
+	fstmiad	YO, { d4  }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+#else	/************************* SINGLE PRECISION *****************************************/
+
+.macro INIT_F2
+
+	// Zero the two column accumulators s2 and s3 with an integer zero.
+	// s2 - s2 would give NaN if the register still held NaN or +/-Inf (stale caller data
+	// or a previous column's non-finite sum), corrupting this column pair.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s2 , r3
+	vmov	s3 , r3
+
+.endm
+
+.macro KERNEL_F2X4
+
+	// Unit-stride step over 4 rows of two columns (single precision).
+	// x[0..3] -> s12-s15, column AO1 -> s8-s11, column AO2 -> s4-s7 (all post-incremented),
+	// then s2 += x[k] * AO1[k] and s3 += x[k] * AO2[k] for k = 0..3.
+
+	fldmias	XO! ,  { s12 - s15 }
+	fldmias	AO1!,  { s8 - s9   }
+	fldmias	AO2!,  { s4 - s5 }
+	fldmias	AO1!,  { s10 - s11 }
+	fldmias	AO2!,  { s6 - s7 }
+
+	vmla.f32	s2 , s12 , s8
+	vmla.f32	s3 , s12 , s4
+	vmla.f32	s2 , s13 , s9
+	vmla.f32	s3 , s13 , s5
+	vmla.f32	s2 , s14, s10
+	vmla.f32	s3 , s14, s6
+	vmla.f32	s2 , s15, s11
+	vmla.f32	s3 , s15, s7
+
+.endm
+
+.macro KERNEL_F2X1
+
+	// Unit-stride remainder step: one row of two columns.
+	// s2 += x * AO1[0], s3 += x * AO2[0]; all three pointers advance by one element.
+	fldmias	XO! ,  { s1 }
+	fldmias	AO1!,  { s8 }
+	fldmias	AO2!,  { s4 }
+	vmla.f32	s2 , s1 , s8
+	vmla.f32	s3 , s1 , s4
+
+.endm
+
+.macro	SAVE_F2
+
+	// Unit-stride update of two consecutive y elements: y[0] += s0 * s2, y[1] += s0 * s3,
+	// then YO advances past both.
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s4 - s5 }
+	vmla.f32	s4, s0, s2
+	vmla.f32	s5, s0, s3
+	fstmias	YO!, { s4 - s5 }
+
+.endm
+
+.macro INIT_F1
+
+	// Zero the single column accumulator s2 with an integer zero.
+	// s2 - s2 would give NaN if s2 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s2 , r3
+
+.endm
+
+.macro KERNEL_F1X4
+
+	// Unit-stride step over 4 rows of one column (single precision):
+	// s2 += x[k] * AO1[k] for k = 0..3; XO and AO1 advance by four elements.
+	fldmias	XO! ,  { s12 - s15 }
+	fldmias	AO1!,  { s8 - s9   }
+	fldmias	AO1!,  { s10 - s11 }
+	vmla.f32	s2 , s12 , s8
+	vmla.f32	s2 , s13 , s9
+	vmla.f32	s2 , s14, s10
+	vmla.f32	s2 , s15, s11
+
+.endm
+
+.macro KERNEL_F1X1
+
+	// Unit-stride remainder step: s2 += x[0] * AO1[0]; both pointers advance one element.
+	fldmias	XO! ,  { s1 }
+	fldmias	AO1!,  { s8 }
+	vmla.f32	s2 , s1 , s8
+
+.endm
+
+.macro	SAVE_F1
+
+	// Unit-stride update of one y element: y[0] += s0 * s2, then YO advances.
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s4 }
+	vmla.f32	s4, s0, s2
+	fstmias	YO!, { s4 }
+
+.endm
+
+
+.macro INIT_S2
+
+	// Zero the two column accumulators s2 and s3 (strided path) with an integer zero.
+	// s2 - s2 would give NaN if the register still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s2 , r3
+	vmov	s3 , r3
+
+.endm
+
+.macro KERNEL_S2X4
+
+	// Strided-x step over 4 rows of two columns (single precision).
+	// x elements are fetched one at a time into s12-s15, stepping XO by INC_X (already
+	// scaled to bytes on this path); AO1 -> s8-s11, AO2 -> s4-s7 are contiguous.
+	// Then s2 += x[k] * AO1[k], s3 += x[k] * AO2[k] for k = 0..3.
+
+	fldmias	XO ,  { s12 }
+	add	XO, XO, INC_X
+
+	fldmias	AO1!,  { s8 - s9   }
+	fldmias	AO2!,  { s4 - s5 }
+
+	fldmias	XO ,  { s13 }
+	add	XO, XO, INC_X
+	fldmias	AO1!,  { s10 - s11 }
+	fldmias	AO2!,  { s6 - s7 }
+
+	fldmias	XO ,  { s14 }
+	add	XO, XO, INC_X
+
+	fldmias	XO ,  { s15 }
+	add	XO, XO, INC_X
+
+	vmla.f32	s2 , s12 , s8
+	vmla.f32	s3 , s12 , s4
+	vmla.f32	s2 , s13 , s9
+	vmla.f32	s3 , s13 , s5
+	vmla.f32	s2 , s14, s10
+	vmla.f32	s3 , s14, s6
+	vmla.f32	s2 , s15, s11
+	vmla.f32	s3 , s15, s7
+
+.endm
+
+.macro KERNEL_S2X1
+
+	// Strided-x remainder step: one row of two columns.
+	// s2 += x * AO1[0], s3 += x * AO2[0]; XO steps by INC_X (bytes), AO1/AO2 by one element.
+	fldmias	XO ,  { s1 }
+	fldmias	AO1!,  { s8 }
+	fldmias	AO2!,  { s4 }
+	vmla.f32	s2 , s1 , s8
+	add	XO, XO, INC_X
+	vmla.f32	s3 , s1 , s4
+
+.endm
+
+.macro	SAVE_S2
+
+	// Strided update of two y elements: y += s0 * s2, step YO by INC_Y (bytes),
+	// then y += s0 * s3 and step again.
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s4 }
+	vmla.f32	s4, s0, s2
+	fstmias	YO, { s4  }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s5 }
+	vmla.f32	s5, s0, s3
+	fstmias	YO, { s5  }
+	add	YO, YO, INC_Y
+
+.endm
+
+.macro INIT_S1
+
+	// Zero the single column accumulator s2 (strided path) with an integer zero.
+	// s2 - s2 would give NaN if s2 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s2 , r3
+
+.endm
+
+.macro KERNEL_S1X4
+
+	// Strided-x step over 4 rows of one column (single precision):
+	// x[k] -> s12-s15 with XO stepping by INC_X (bytes), AO1 -> s8-s11,
+	// then s2 += x[k] * AO1[k] for k = 0..3.
+
+	fldmias	XO ,  { s12 }
+	add	XO, XO, INC_X
+
+	pld	[ AO1 , #A_PRE ]
+	fldmias	AO1!,  { s8 - s9   }
+
+	fldmias	XO ,  { s13 }
+	add	XO, XO, INC_X
+	fldmias	AO1!,  { s10 - s11 }
+
+	fldmias	XO ,  { s14 }
+	add	XO, XO, INC_X
+
+	fldmias	XO ,  { s15 }
+	add	XO, XO, INC_X
+
+	vmla.f32	s2 , s12 , s8
+	vmla.f32	s2 , s13 , s9
+	vmla.f32	s2 , s14, s10
+	vmla.f32	s2 , s15, s11
+
+.endm
+
+.macro KERNEL_S1X1
+
+	// Strided-x remainder step: s2 += x * AO1[0]; XO steps by INC_X (bytes), AO1 by one element.
+	fldmias	XO ,  { s1 }
+	fldmias	AO1!,  { s8 }
+	vmla.f32	s2 , s1 , s8
+	add	XO, XO, INC_X
+
+.endm
+
+.macro	SAVE_S1
+
+	// Strided update of one y element: y += s0 * s2, then YO steps by INC_Y (bytes).
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s4 }
+	vmla.f32	s4, s0, s2
+	fstmias	YO, { s4  }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	// gemv_t kernel body (VFP).  Per the defines above: r0 = M, r1 = N on entry, r3 = A on
+	// entry; LDA, X, INC_X, Y and INC_Y are read from the caller's stack at [fp, #0 .. #16].
+	// For every column (processed two at a time, then an odd last one) the KERNEL_* macros
+	// accumulate sum_i A[i] * x[i] and the SAVE_* macro adds d0/s0 times that sum into y.
+	// Labels F* are the path taken when INC_X == 1 and INC_Y == 1; labels S* handle any
+	// other stride.  Always returns 0 in r0.
+	PROLOGUE
+
+	.align 5
+	push    {r4 - r9 , fp}
+        add     fp, sp, #28					// 7 registers pushed = 28 bytes, so fp = sp at entry
+	sub     sp, sp, #STACKSIZE                              // reserve stack
+
+        sub     r12, fp, #192					// FP register save area starts at fp - 192
+
+#if	defined(DOUBLE)
+        vstm    r12, { d8 - d15 }                                 // store floating point registers
+#else
+        vstm    r12, { s8 - s15 }                                 // store floating point registers
+#endif
+
+	// Nothing to do for an empty matrix.
+	cmp	M, #0
+	ble	gemvt_kernel_L999
+
+	cmp	OLD_N, #0
+	ble	gemvt_kernel_L999
+
+	// Spill A and N to the local frame: r3 is reused as scratch and r1 becomes AO1.
+	str	OLD_A, A
+	str	OLD_N, N
+
+	ldr    INC_X , OLD_INC_X
+	ldr    INC_Y , OLD_INC_Y
+
+	// A zero increment is treated as "nothing to do".
+	cmp	INC_X, #0
+	beq	gemvt_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	gemvt_kernel_L999
+
+	ldr	LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #3				// LDA * SIZE
+#else
+	lsl	LDA, LDA, #2				// LDA * SIZE
+#endif
+
+	// Use the unit-stride (F) path only when both increments are exactly 1.
+	cmp	INC_X, #1
+	bne	gemvt_kernel_S2_BEGIN
+
+	cmp	INC_Y, #1
+	bne	gemvt_kernel_S2_BEGIN
+
+
+gemvt_kernel_F2_BEGIN:
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	gemvt_kernel_F1_BEGIN
+
+gemvt_kernel_F2X4:
+
+	// Set up the next column pair and store the address of the column after it back to A.
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_F2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_F2X1
+
+
+gemvt_kernel_F2X4_10:
+
+	KERNEL_F2X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F2X4_10
+
+
+gemvt_kernel_F2X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_F2_END
+
+gemvt_kernel_F2X1_10:
+
+	KERNEL_F2X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F2X1_10
+
+
+gemvt_kernel_F2_END:
+
+	SAVE_F2
+
+	subs	J , J , #1
+	bne	gemvt_kernel_F2X4
+
+
+gemvt_kernel_F1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1					// odd N: one column left
+	ble	gemvt_kernel_L999
+
+gemvt_kernel_F1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_F1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_F1X1
+
+
+gemvt_kernel_F1X4_10:
+
+	KERNEL_F1X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F1X4_10
+
+
+gemvt_kernel_F1X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_F1_END
+
+gemvt_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F1X1_10
+
+
+gemvt_kernel_F1_END:
+
+	SAVE_F1
+
+	b	gemvt_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+gemvt_kernel_S2_BEGIN:
+
+	// Strided path: convert both increments from elements to bytes.
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+#endif
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	gemvt_kernel_S1_BEGIN
+
+gemvt_kernel_S2X4:
+
+	// Set up the next column pair and store the address of the column after it back to A.
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_S2X1
+
+
+gemvt_kernel_S2X4_10:
+
+	KERNEL_S2X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S2X4_10
+
+
+gemvt_kernel_S2X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_S2_END
+
+gemvt_kernel_S2X1_10:
+
+	KERNEL_S2X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S2X1_10
+
+
+gemvt_kernel_S2_END:
+
+	SAVE_S2
+
+	subs	J , J , #1
+	bne	gemvt_kernel_S2X4
+
+
+gemvt_kernel_S1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1					// odd N: one column left
+	ble	gemvt_kernel_L999
+
+gemvt_kernel_S1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_S1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_S1X1
+
+
+gemvt_kernel_S1X4_10:
+
+	KERNEL_S1X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S1X4_10
+
+
+gemvt_kernel_S1X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_S1_END
+
+gemvt_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S1X1_10
+
+
+gemvt_kernel_S1_END:
+
+	SAVE_S1
+
+
+
+/*************************************************************************************************************/
+
+gemvt_kernel_L999:
+
+	// Common exit: restore the FP registers saved in the prologue, unwind the frame, return 0.
+        sub     r3, fp, #192
+
+#if	defined(DOUBLE)
+        vldm    r3, { d8 - d15 }                                 // restore floating point registers
+#else
+        vldm    r3, { s8 - s15 }                                 // restore floating point registers
+#endif
+
+	mov	r0, #0		// set return value
+
+	sub     sp, fp, #28
+	pop     {r4 -r9 ,fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/gemv_t_vfpv3.S
+++ b/kernel/arm/gemv_t_vfpv3.S
@ -0,0 +1,732 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/18 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	OLD_LDA		[fp, #0 ]
+#define	X		[fp, #4 ]
+#define	OLD_INC_X	[fp, #8 ]
+#define	Y		[fp, #12 ]
+#define	OLD_INC_Y	[fp, #16 ]
+#define OLD_A		r3
+#define	OLD_N		r1
+
+#define M	r0
+#define AO1	r1
+#define J	r2
+
+#define AO2	r4
+#define XO	r5
+#define YO	r6
+#define LDA	r7
+#define INC_X	r8
+#define INC_Y	r9
+
+#define I	r12
+
+#define N	[fp, #-252 ]
+#define A	[fp, #-256 ]
+
+
+#define X_PRE	512
+#define A_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+
+#if	defined(DOUBLE)
+
+.macro INIT_F2
+
+	// Zero the two column accumulators d4 and d5.
+	// An integer zero is moved in instead of computing d4 - d4: x - x is NaN when the
+	// register still holds NaN or +/-Inf (stale caller data or a previous column's
+	// non-finite sum), which would corrupt this column pair.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d4 , r3 , r3
+	vmov	d5 , r3 , r3
+
+.endm
+
+.macro KERNEL_F2X4
+
+	// Unit-stride step over 4 rows of two columns (double precision, VFPv3 registers).
+	// x[0..3] -> d28-d31, column AO1 -> d8-d11, column AO2 -> d16-d19; loads are
+	// interleaved with the multiply-accumulates:
+	//   d4 += x[k] * AO1[k]   and   d5 += x[k] * AO2[k]   for k = 0..3.
+
+	pld	[ XO  , #X_PRE ]
+	fldmiad	XO! ,  { d28 - d31 }
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	AO1!,  { d8 - d9   }
+	pld	[ AO2 , #A_PRE ]
+	fldmiad	AO2!,  { d16 - d17 }
+	vmla.f64	d4 , d28 , d8
+	vmla.f64	d5 , d28 , d16
+	fldmiad	AO1!,  { d10 - d11 }
+	vmla.f64	d4 , d29 , d9
+	vmla.f64	d5 , d29 , d17
+	fldmiad	AO2!,  { d18 - d19 }
+	vmla.f64	d4 , d30, d10
+	vmla.f64	d5 , d30, d18
+	vmla.f64	d4 , d31, d11
+	vmla.f64	d5 , d31, d19
+
+.endm
+
+
+.macro KERNEL_F2X1
+
+	// Unit-stride remainder step: one row of two columns.
+	// d4 += x * AO1[0], d5 += x * AO2[0]; all three pointers advance by one element.
+	fldmiad	XO! ,  { d2 }
+	fldmiad	AO1!,  { d8 }
+	fldmiad	AO2!,  { d16 }
+	vmla.f64	d4 , d2 , d8
+	vmla.f64	d5 , d2 , d16
+
+.endm
+
+.macro	SAVE_F2
+
+	// Unit-stride update of two consecutive y elements: y[0] += d0 * d4, y[1] += d0 * d5,
+	// then YO advances past both.
+	// d0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmiad	YO,  { d24 - d25 }
+	vmla.f64	d24, d0, d4
+	vmla.f64	d25, d0, d5
+	fstmiad	YO!, { d24 - d25 }
+
+.endm
+
+.macro INIT_S2
+
+	// Zero the two column accumulators d4 and d5 (strided path) with an integer zero.
+	// d4 - d4 would give NaN if the register still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d4 , r3 , r3
+	vmov	d5 , r3 , r3
+
+.endm
+
+.macro KERNEL_S2X4
+
+	// Strided-x step over 4 rows of two columns (double precision, VFPv3 registers).
+	// x elements are fetched one at a time into d28-d31, stepping XO by INC_X (already
+	// scaled to bytes on this path); AO1 -> d8-d11 and AO2 -> d16-d19 are contiguous.
+	// Loads are interleaved with d4 += x[k] * AO1[k] and d5 += x[k] * AO2[k].
+
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	XO ,  { d28 }
+	add	XO, XO, INC_X
+	fldmiad	AO1!,  { d8 - d9   }
+	pld	[ AO2 , #A_PRE ]
+	fldmiad	AO2!,  { d16 - d17 }
+	vmla.f64	d4 , d28 , d8
+	fldmiad	XO ,  { d29 }
+	add	XO, XO, INC_X
+	vmla.f64	d5 , d28 , d16
+	fldmiad	AO1!,  { d10 - d11 }
+	vmla.f64	d4 , d29 , d9
+	fldmiad	XO ,  { d30 }
+	add	XO, XO, INC_X
+	vmla.f64	d5 , d29 , d17
+	fldmiad	AO2!,  { d18 - d19 }
+	vmla.f64	d4 , d30, d10
+	fldmiad	XO ,  { d31 }
+	add	XO, XO, INC_X
+	vmla.f64	d5 , d30, d18
+	vmla.f64	d4 , d31, d11
+	vmla.f64	d5 , d31, d19
+
+.endm
+
+
+.macro KERNEL_S2X1
+
+	// Strided-x remainder step: one row of two columns.
+	// d4 += x * AO1[0], d5 += x * AO2[0]; XO steps by INC_X (bytes), AO1/AO2 by one element.
+	fldmiad	XO ,  { d2 }
+	fldmiad	AO1!,  { d8 }
+	add	XO, XO, INC_X
+	fldmiad	AO2!,  { d16 }
+	vmla.f64	d4 , d2 , d8
+	vmla.f64	d5 , d2 , d16
+
+.endm
+
+.macro	SAVE_S2
+
+	// Strided update of two y elements: y += d0 * d4, step YO by INC_Y (bytes),
+	// then y += d0 * d5 and step again.  d24 is reused as the temporary for both.
+	// d0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmiad	YO,  { d24 }
+	vmla.f64	d24, d0, d4
+	fstmiad	YO,  { d24 }
+	add	YO, YO, INC_Y
+
+	fldmiad	YO,  { d24 }
+	vmla.f64	d24, d0, d5
+	fstmiad	YO,  { d24 }
+	add	YO, YO, INC_Y
+
+.endm
+
+.macro INIT_F1
+
+	// Zero the single column accumulator d4 with an integer zero.
+	// d4 - d4 would give NaN if d4 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d4 , r3 , r3
+
+.endm
+
+.macro KERNEL_F1X4
+
+	// Unit-stride step over 4 rows of one column (double precision, VFPv3 registers):
+	// x[0..3] -> d28-d31, AO1 -> d8-d11, d4 += x[k] * AO1[k] for k = 0..3.
+	pld	[ XO  , #X_PRE ]
+	fldmiad	XO! ,  { d28 - d31 }
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	AO1!,  { d8 - d9   }
+	vmla.f64	d4 , d28 , d8
+	fldmiad	AO1!,  { d10 - d11 }
+	vmla.f64	d4 , d29 , d9
+	vmla.f64	d4 , d30, d10
+	vmla.f64	d4 , d31, d11
+
+.endm
+
+
+.macro KERNEL_F1X1
+
+	// Unit-stride remainder step: d4 += x[0] * AO1[0]; both pointers advance one element.
+	fldmiad	XO! ,  { d2 }
+	fldmiad	AO1!,  { d8 }
+	vmla.f64	d4 , d2 , d8
+
+.endm
+
+.macro	SAVE_F1
+
+	// Unit-stride update of one y element: y[0] += d0 * d4, then YO advances.
+	// d0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmiad	YO,  { d24 }
+	vmla.f64	d24, d0, d4
+	fstmiad	YO!, { d24 }
+
+.endm
+
+.macro INIT_S1
+
+	// Zero the single column accumulator d4 (strided path) with an integer zero.
+	// d4 - d4 would give NaN if d4 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	d4 , r3 , r3
+
+.endm
+
+.macro KERNEL_S1X4
+
+	// Strided-x step over 4 rows of one column (double precision, VFPv3 registers):
+	// x[k] -> d28-d31 with XO stepping by INC_X (bytes), AO1 -> d8-d11,
+	// loads interleaved with d4 += x[k] * AO1[k] for k = 0..3.
+
+	pld	[ AO1 , #A_PRE ]
+	fldmiad	XO ,  { d28 }
+	add	XO, XO, INC_X
+	fldmiad	AO1!,  { d8 - d9   }
+	vmla.f64	d4 , d28 , d8
+	fldmiad	XO ,  { d29 }
+	add	XO, XO, INC_X
+	fldmiad	AO1!,  { d10 - d11 }
+	vmla.f64	d4 , d29 , d9
+	fldmiad	XO ,  { d30 }
+	add	XO, XO, INC_X
+	vmla.f64	d4 , d30, d10
+	fldmiad	XO ,  { d31 }
+	add	XO, XO, INC_X
+	vmla.f64	d4 , d31, d11
+
+.endm
+
+
+.macro KERNEL_S1X1
+
+	// Strided-x remainder step: d4 += x * AO1[0]; XO steps by INC_X (bytes), AO1 by one element.
+	fldmiad	XO ,  { d2 }
+	fldmiad	AO1!,  { d8 }
+	add	XO, XO, INC_X
+	vmla.f64	d4 , d2 , d8
+
+.endm
+
+.macro	SAVE_S1
+
+	// Strided update of one y element: y += d0 * d4, then YO steps by INC_Y (bytes).
+	// d0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmiad	YO,  { d24 }
+	vmla.f64	d24, d0, d4
+	fstmiad	YO,  { d24 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+#else	/************************* SINGLE PRECISION *****************************************/
+
+.macro INIT_F2
+
+	// Zero the two column accumulators s4 and s5 with an integer zero.
+	// s4 - s4 would give NaN if the register still held NaN or +/-Inf (stale caller data
+	// or a previous column's non-finite sum), corrupting this column pair.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s4 , r3
+	vmov	s5 , r3
+
+.endm
+
+.macro KERNEL_F2X4
+
+	// Unit-stride step over 4 rows of two columns (single precision, VFPv3 registers).
+	// x[0..3] -> s28-s31, column AO1 -> s8-s11, column AO2 -> s16-s19; loads are
+	// interleaved with s4 += x[k] * AO1[k] and s5 += x[k] * AO2[k] for k = 0..3.
+
+	fldmias	XO! ,  { s28 - s31 }
+	fldmias	AO1!,  { s8 - s9   }
+	fldmias	AO2!,  { s16 - s17 }
+	vmla.f32	s4 , s28 , s8
+	vmla.f32	s5 , s28 , s16
+	fldmias	AO1!,  { s10 - s11 }
+	vmla.f32	s4 , s29 , s9
+	vmla.f32	s5 , s29 , s17
+	fldmias	AO2!,  { s18 - s19 }
+	vmla.f32	s4 , s30, s10
+	vmla.f32	s5 , s30, s18
+	vmla.f32	s4 , s31, s11
+	vmla.f32	s5 , s31, s19
+
+.endm
+
+
+.macro KERNEL_F2X1
+
+	// Unit-stride remainder step: one row of two columns.
+	// s4 += x * AO1[0], s5 += x * AO2[0]; all three pointers advance by one element.
+	fldmias	XO! ,  { s2 }
+	fldmias	AO1!,  { s8 }
+	fldmias	AO2!,  { s16 }
+	vmla.f32	s4 , s2 , s8
+	vmla.f32	s5 , s2 , s16
+
+.endm
+
+.macro	SAVE_F2
+
+	// Unit-stride update of two consecutive y elements: y[0] += s0 * s4, y[1] += s0 * s5,
+	// then YO advances past both.
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s24 - s25 }
+	vmla.f32	s24, s0, s4
+	vmla.f32	s25, s0, s5
+	fstmias	YO!, { s24 - s25 }
+
+.endm
+
+.macro INIT_S2
+
+	// Zero the two column accumulators s4 and s5 (strided path) with an integer zero.
+	// s4 - s4 would give NaN if the register still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s4 , r3
+	vmov	s5 , r3
+
+.endm
+
+.macro KERNEL_S2X4
+
+	// Strided-x step over 4 rows of two columns (single precision, VFPv3 registers).
+	// x elements are fetched one at a time into s28-s31, stepping XO by INC_X (already
+	// scaled to bytes on this path); AO1 -> s8-s11 and AO2 -> s16-s19 are contiguous.
+	// Loads are interleaved with s4 += x[k] * AO1[k] and s5 += x[k] * AO2[k].
+
+	fldmias	XO ,  { s28 }
+	add	XO, XO, INC_X
+	fldmias	AO1!,  { s8 - s9   }
+	fldmias	AO2!,  { s16 - s17 }
+	vmla.f32	s4 , s28 , s8
+	fldmias	XO ,  { s29 }
+	add	XO, XO, INC_X
+	vmla.f32	s5 , s28 , s16
+	fldmias	AO1!,  { s10 - s11 }
+	vmla.f32	s4 , s29 , s9
+	fldmias	XO ,  { s30 }
+	add	XO, XO, INC_X
+	vmla.f32	s5 , s29 , s17
+	fldmias	AO2!,  { s18 - s19 }
+	vmla.f32	s4 , s30, s10
+	fldmias	XO ,  { s31 }
+	add	XO, XO, INC_X
+	vmla.f32	s5 , s30, s18
+	vmla.f32	s4 , s31, s11
+	vmla.f32	s5 , s31, s19
+
+.endm
+
+
+.macro KERNEL_S2X1
+
+	// Strided-x remainder step: one row of two columns.
+	// s4 += x * AO1[0], s5 += x * AO2[0]; XO steps by INC_X (bytes), AO1/AO2 by one element.
+	fldmias	XO ,  { s2 }
+	fldmias	AO1!,  { s8 }
+	add	XO, XO, INC_X
+	fldmias	AO2!,  { s16 }
+	vmla.f32	s4 , s2 , s8
+	vmla.f32	s5 , s2 , s16
+
+.endm
+
+.macro	SAVE_S2
+
+	// Strided update of two y elements: y += s0 * s4, step YO by INC_Y (bytes),
+	// then y += s0 * s5 and step again.  s24 is reused as the temporary for both.
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s24 }
+	vmla.f32	s24, s0, s4
+	fstmias	YO,  { s24 }
+	add	YO, YO, INC_Y
+
+	fldmias	YO,  { s24 }
+	vmla.f32	s24, s0, s5
+	fstmias	YO,  { s24 }
+	add	YO, YO, INC_Y
+
+.endm
+
+.macro INIT_F1
+
+	// Zero the single column accumulator s4 with an integer zero.
+	// s4 - s4 would give NaN if s4 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s4 , r3
+
+.endm
+
+.macro KERNEL_F1X4
+
+	// Unit-stride step over 4 rows of one column (single precision, VFPv3 registers):
+	// x[0..3] -> s28-s31, AO1 -> s8-s11, s4 += x[k] * AO1[k] for k = 0..3.
+	fldmias	XO! ,  { s28 - s31 }
+	fldmias	AO1!,  { s8 - s9   }
+	vmla.f32	s4 , s28 , s8
+	fldmias	AO1!,  { s10 - s11 }
+	vmla.f32	s4 , s29 , s9
+	vmla.f32	s4 , s30, s10
+	vmla.f32	s4 , s31, s11
+
+.endm
+
+
+.macro KERNEL_F1X1
+
+	// Unit-stride remainder step: s4 += x[0] * AO1[0]; both pointers advance one element.
+	fldmias	XO! ,  { s2 }
+	fldmias	AO1!,  { s8 }
+	vmla.f32	s4 , s2 , s8
+
+.endm
+
+.macro	SAVE_F1
+
+	// Unit-stride update of one y element: y[0] += s0 * s4, then YO advances.
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s24 }
+	vmla.f32	s24, s0, s4
+	fstmias	YO!, { s24 }
+
+.endm
+
+.macro INIT_S1
+
+	// Zero the single column accumulator s4 (strided path) with an integer zero.
+	// s4 - s4 would give NaN if s4 still held NaN or +/-Inf from earlier work.
+	// Clobbers r3, which is scratch at every expansion site (OLD_A is already spilled to A).
+	mov	r3 , #0
+	vmov	s4 , r3
+
+.endm
+
+.macro KERNEL_S1X4
+
+	// Strided-x step over 4 rows of one column (single precision, VFPv3 registers):
+	// x[k] -> s28-s31 with XO stepping by INC_X (bytes), AO1 -> s8-s11,
+	// loads interleaved with s4 += x[k] * AO1[k] for k = 0..3.
+
+	fldmias	XO ,  { s28 }
+	add	XO, XO, INC_X
+	fldmias	AO1!,  { s8 - s9   }
+	vmla.f32	s4 , s28 , s8
+	fldmias	XO ,  { s29 }
+	add	XO, XO, INC_X
+	fldmias	AO1!,  { s10 - s11 }
+	vmla.f32	s4 , s29 , s9
+	fldmias	XO ,  { s30 }
+	add	XO, XO, INC_X
+	vmla.f32	s4 , s30, s10
+	fldmias	XO ,  { s31 }
+	add	XO, XO, INC_X
+	vmla.f32	s4 , s31, s11
+
+.endm
+
+
+.macro KERNEL_S1X1
+
+	// Strided-x remainder step: s4 += x * AO1[0]; XO steps by INC_X (bytes), AO1 by one element.
+	fldmias	XO ,  { s2 }
+	fldmias	AO1!,  { s8 }
+	add	XO, XO, INC_X
+	vmla.f32	s4 , s2 , s8
+
+.endm
+
+.macro	SAVE_S1
+
+	// Strided update of one y element: y += s0 * s4, then YO steps by INC_Y (bytes).
+	// s0 is never written in this file; presumably the incoming alpha (TODO confirm).
+	fldmias	YO,  { s24 }
+	vmla.f32	s24, s0, s4
+	fstmias	YO,  { s24 }
+	add	YO, YO, INC_Y
+
+.endm
+
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	// gemv_t kernel body (VFPv3).  Per the defines above: r0 = M, r1 = N on entry, r3 = A on
+	// entry; LDA, X, INC_X, Y and INC_Y are read from the caller's stack at [fp, #0 .. #16].
+	// For every column (processed two at a time, then an odd last one) the KERNEL_* macros
+	// accumulate sum_i A[i] * x[i] and the SAVE_* macro adds d0/s0 times that sum into y.
+	// Labels F* are the path taken when INC_X == 1 and INC_Y == 1; labels S* handle any
+	// other stride.  Always returns 0 in r0.
+	PROLOGUE
+
+	.align 5
+	push    {r4 - r9 , fp}
+        add     fp, sp, #28					// 7 registers pushed = 28 bytes, so fp = sp at entry
+	sub     sp, sp, #STACKSIZE                              // reserve stack
+
+        sub     r12, fp, #192					// FP register save area starts at fp - 192
+
+#if	defined(DOUBLE)
+        vstm    r12, { d8 - d15 }                                 // store floating point registers
+#else
+        vstm    r12, { s8 - s31 }                                 // store floating point registers
+#endif
+
+	// Nothing to do for an empty matrix.
+	cmp	M, #0
+	ble	gemvt_kernel_L999
+
+	cmp	OLD_N, #0
+	ble	gemvt_kernel_L999
+
+	// Spill A and N to the local frame: r3 is reused as scratch and r1 becomes AO1.
+	str	OLD_A, A
+	str	OLD_N, N
+
+	ldr    INC_X , OLD_INC_X
+	ldr    INC_Y , OLD_INC_Y
+
+	// A zero increment is treated as "nothing to do".
+	cmp	INC_X, #0
+	beq	gemvt_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	gemvt_kernel_L999
+
+	ldr	LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #3				// LDA * SIZE
+#else
+	lsl	LDA, LDA, #2				// LDA * SIZE
+#endif
+
+	// Use the unit-stride (F) path only when both increments are exactly 1.
+	cmp	INC_X, #1
+	bne	gemvt_kernel_S2_BEGIN
+
+	cmp	INC_Y, #1
+	bne	gemvt_kernel_S2_BEGIN
+
+
+gemvt_kernel_F2_BEGIN:
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	gemvt_kernel_F1_BEGIN
+
+gemvt_kernel_F2X4:
+
+	// Set up the next column pair and store the address of the column after it back to A.
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_F2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_F2X1
+
+
+gemvt_kernel_F2X4_10:
+
+	KERNEL_F2X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F2X4_10
+
+
+gemvt_kernel_F2X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_F2_END
+
+gemvt_kernel_F2X1_10:
+
+	KERNEL_F2X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F2X1_10
+
+
+gemvt_kernel_F2_END:
+
+	SAVE_F2
+
+	subs	J , J , #1
+	bne	gemvt_kernel_F2X4
+
+
+gemvt_kernel_F1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1					// odd N: one column left
+	ble	gemvt_kernel_L999
+
+gemvt_kernel_F1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_F1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_F1X1
+
+
+gemvt_kernel_F1X4_10:
+
+	KERNEL_F1X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F1X4_10
+
+
+gemvt_kernel_F1X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_F1_END
+
+gemvt_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F1X1_10
+
+
+gemvt_kernel_F1_END:
+
+	SAVE_F1
+
+	b	gemvt_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+gemvt_kernel_S2_BEGIN:
+
+	// Strided path: convert both increments from elements to bytes.
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+#endif
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	gemvt_kernel_S1_BEGIN
+
+gemvt_kernel_S2X4:
+
+	// Set up the next column pair and store the address of the column after it back to A.
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_S2X1
+
+
+gemvt_kernel_S2X4_10:
+
+	KERNEL_S2X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S2X4_10
+
+
+gemvt_kernel_S2X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_S2_END
+
+gemvt_kernel_S2X1_10:
+
+	KERNEL_S2X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S2X1_10
+
+
+gemvt_kernel_S2_END:
+
+	SAVE_S2
+
+	subs	J , J , #1
+	bne	gemvt_kernel_S2X4
+
+
+gemvt_kernel_S1_BEGIN:
+
+	ldr	J, N
+	ands	J, J, #1					// odd N: one column left
+	ble	gemvt_kernel_L999
+
+gemvt_kernel_S1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_S1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_S1X1
+
+
+gemvt_kernel_S1X4_10:
+
+	KERNEL_S1X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S1X4_10
+
+
+gemvt_kernel_S1X1:
+
+	ands	I, M , #3					// I = M % 4 (remaining rows)
+	ble	gemvt_kernel_S1_END
+
+gemvt_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S1X1_10
+
+
+gemvt_kernel_S1_END:
+
+	SAVE_S1
+
+
+
+/*************************************************************************************************************/
+
+gemvt_kernel_L999:
+
+	// Common exit: restore the FP registers saved in the prologue, unwind the frame, return 0.
+        sub     r3, fp, #192
+
+#if	defined(DOUBLE)
+        vldm    r3, { d8 - d15 }                                 // restore floating point registers
+#else
+        vldm    r3, { s8 - s31 }                                 // restore floating point registers
+#endif
+
+	mov	r0, #0		// set return value
+
+	sub     sp, fp, #28
+	pop     {r4 -r9 ,fp}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/iamax.c
+++ b/kernel/arm/iamax.c
@ -0,0 +1,75 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+/*
+ * Portable C IAMAX kernel: returns the 1-based index of the first element of x with the
+ * largest absolute value, scanning n elements with stride inc_x.
+ *
+ * Returns 0 when there is nothing to scan (n <= 0) or when inc_x < 1.
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0;
+	FLOAT maxf=0.0;
+	BLASLONG max=0;
+
+	/* n == 0 must be rejected too: otherwise x[0] is read from an empty vector
+	   and index 1 is reported for it. */
+	if (n <= 0 || inc_x < 1 ) return(max);
+
+	maxf=ABS(x[0]);
+
+	while(i < n)
+	{
+		/* maxf is already an absolute value; strict '>' keeps the first maximum */
+		if( ABS(x[ix]) > maxf )
+		{
+			max = i;
+			maxf = ABS(x[ix]);
+		}
+		ix += inc_x;
+		i++;
+	}
+	return(max+1);
+}
+	
+
--- a/kernel/arm/iamax_vfp.S
+++ b/kernel/arm/iamax_vfp.S
@ -0,0 +1,478 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/14 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+// Stack frame size constant; it is not referenced anywhere else in this file.
+#define STACKSIZE 256
+
+// The three arguments are used directly from r0-r2 on entry.
+// INDEX accumulates the result and is copied to r0 on exit.
+// Z is the 1-based position of the element currently being examined.
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+#define INDEX	r3
+#define	Z	r4
+
+// Loop counter for the unrolled and remainder loops.
+#define	I	r12
+
+// Byte offset from X used by the pld prefetch instructions.
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// VABS(dst, src): absolute value in the build precision when USE_ABS is
+// defined; otherwise it expands to a nop so signed values are compared.
+#if	defined(USE_ABS)
+
+#if	defined(DOUBLE)
+
+#define	VABS(x0,x1)	vabs.f64	x0, x1
+
+#else
+
+#define	VABS(x0,x1)	vabs.f32	x0, x1
+
+#endif
+
+#else
+
+#define VABS(x0,x1)	nop
+
+#endif
+
+/*****************************************************************************************/
+
+// MOVCOND / VMOVCOND: conditional integer and VFP moves that update the
+// running result.  With USE_MIN they fire on "less than" (minimum search),
+// otherwise on "greater than" (maximum search).
+#if	defined(USE_MIN)
+
+#define	MOVCOND		movlt
+
+#if	defined(DOUBLE)
+
+#define	VMOVCOND	vmovlt.f64
+
+#else
+
+#define	VMOVCOND	vmovlt.f32
+
+#endif
+
+#else
+
+#define	MOVCOND		movgt
+
+#if	defined(DOUBLE)
+
+#define	VMOVCOND	vmovgt.f64
+
+#else
+
+#define	VMOVCOND	vmovgt.f32
+
+#endif
+
+
+#endif
+
+
+/*****************************************************************************************/
+
+
+
+#if	!defined(COMPLEX)
+
+#if	defined(DOUBLE)
+
+// Real, double precision, contiguous data.
+// INIT_F: load the first element into d0 (X advances past it), apply VABS,
+// and start both the position counter Z and the result INDEX at 1.
+.macro INIT_F
+
+	fldmiad	X!, { d0 }
+	VABS(   d0,  d0 )
+	mov	Z, #1
+	mov	INDEX, Z
+
+.endm
+
+// KERNEL_F1: load the next element into d4 (X advances), bump Z, apply VABS
+// and compare d4 against the running value in d0.  vmrs copies the VFP
+// compare flags into APSR so that VMOVCOND / MOVCOND replace d0 and INDEX
+// only when the new element strictly wins; ties keep the earlier index.
+.macro KERNEL_F1
+
+	fldmiad	X!, { d4 }
+	add	Z, Z, #1
+	VABS(   d4,  d4 )
+	vcmpe.f64  	d4,  d0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	d0,  d4
+	MOVCOND		INDEX, Z
+
+.endm
+
+// INIT_S: strided variant of INIT_F.  The load leaves X unchanged and the
+// pointer is then advanced by INC_X, which is added as a byte offset.
+.macro INIT_S
+
+	fldmiad	X, { d0 }
+	VABS(   d0,  d0 )
+	mov	Z, #1
+	mov	INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+
+// KERNEL_S1: strided variant of KERNEL_F1; same compare and conditional
+// update, followed by stepping X forward by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmiad	X, { d4 }
+	add	Z, Z, #1
+	VABS(   d4,  d4 )
+	vcmpe.f64  	d4,  d0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	d0,  d4
+	MOVCOND		INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+#else
+
+// Real, single precision, contiguous data.
+// INIT_F: load the first element into s0 (X advances past it), apply VABS,
+// and start both the position counter Z and the result INDEX at 1.
+.macro INIT_F
+
+	fldmias	X!, { s0 }
+	VABS(   s0,  s0 )
+	mov	Z, #1
+	mov	INDEX, Z
+
+.endm
+
+// KERNEL_F1: load the next element into s4 (X advances), bump Z, apply VABS
+// and compare s4 against the running value in s0.  vmrs copies the VFP
+// compare flags into APSR so that VMOVCOND / MOVCOND replace s0 and INDEX
+// only when the new element strictly wins; ties keep the earlier index.
+.macro KERNEL_F1
+
+	fldmias	X!, { s4 }
+	add	Z, Z, #1
+	VABS(   s4,  s4 )
+	vcmpe.f32  	s4,  s0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	s0,  s4
+	MOVCOND		INDEX, Z
+
+.endm
+
+// INIT_S: strided variant of INIT_F.  The load leaves X unchanged and the
+// pointer is then advanced by INC_X, which is added as a byte offset.
+.macro INIT_S
+
+	fldmias	X, { s0 }
+	VABS(   s0,  s0 )
+	mov	Z, #1
+	mov	INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+
+// KERNEL_S1: strided variant of KERNEL_F1; same compare and conditional
+// update, followed by stepping X forward by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmias	X, { s4 }
+	add	Z, Z, #1
+	VABS(   s4,  s4 )
+	vcmpe.f32  	s4,  s0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	s0,  s4
+	MOVCOND		INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+
+
+
+#endif
+
+#else
+
+#if	defined(DOUBLE)
+
+// Complex, double precision, contiguous data.  Each element is a pair of
+// doubles and is ranked by the sum of the absolute values of the pair.
+// The absolute values here are unconditional (vabs, not the VABS macro).
+// INIT_F: load the first pair into d0/d1 (X advances past it), reduce it to
+// |d0| + |d1| in d0, and start Z and INDEX at 1.
+.macro INIT_F
+
+	fldmiad	X!, { d0 -d1 }
+	vabs.f64   d0,  d0
+	vabs.f64   d1,  d1
+	vadd.f64   d0  , d0,  d1
+	mov	Z, #1
+	mov	INDEX, Z
+
+.endm
+
+
+// KERNEL_F1: load the next pair into d4/d5 (X advances), bump Z, reduce the
+// pair to |d4| + |d5| in d4 and compare it with the running value in d0.
+// vmrs copies the VFP compare flags into APSR so that VMOVCOND / MOVCOND
+// replace d0 and INDEX only when the new element strictly wins.
+.macro KERNEL_F1
+
+	fldmiad	X!, { d4 - d5 }
+	add	Z, Z, #1
+	vabs.f64   d4,  d4
+	vabs.f64   d5,  d5
+	vadd.f64   d4  , d4,  d5
+	vcmpe.f64  	d4,  d0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	d0,  d4
+	MOVCOND		INDEX, Z
+
+.endm
+
+// INIT_S: strided variant of INIT_F.  The load leaves X unchanged and the
+// pointer is then advanced by INC_X, which is added as a byte offset.
+.macro INIT_S
+
+	fldmiad	X, { d0 -d1 }
+	vabs.f64   d0,  d0
+	vabs.f64   d1,  d1
+	vadd.f64   d0  , d0,  d1
+	mov	Z, #1
+	mov	INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+
+
+// KERNEL_S1: strided variant of KERNEL_F1; same reduction, compare and
+// conditional update, followed by stepping X forward by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmiad	X, { d4 - d5 }
+	add	Z, Z, #1
+	vabs.f64   d4,  d4
+	vabs.f64   d5,  d5
+	vadd.f64   d4  , d4,  d5
+	vcmpe.f64  	d4,  d0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	d0,  d4
+	MOVCOND		INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+#else
+
+// Complex, single precision, contiguous data.  Each element is a pair of
+// floats and is ranked by the sum of the absolute values of the pair.
+// The absolute values here are unconditional (vabs, not the VABS macro).
+// INIT_F: load the first pair into s0/s1 (X advances past it), reduce it to
+// |s0| + |s1| in s0, and start Z and INDEX at 1.
+.macro INIT_F
+
+	fldmias	X!, { s0 -s1 }
+	vabs.f32   s0,  s0
+	vabs.f32   s1,  s1
+	vadd.f32   s0  , s0,  s1
+	mov	Z, #1
+	mov	INDEX, Z
+
+.endm
+
+
+// KERNEL_F1: load the next pair into s4/s5 (X advances), bump Z, reduce the
+// pair to |s4| + |s5| in s4 and compare it with the running value in s0.
+// vmrs copies the VFP compare flags into APSR so that VMOVCOND / MOVCOND
+// replace s0 and INDEX only when the new element strictly wins.
+.macro KERNEL_F1
+
+	fldmias	X!, { s4 - s5 }
+	add	Z, Z, #1
+	vabs.f32   s4,  s4
+	vabs.f32   s5,  s5
+	vadd.f32   s4  , s4,  s5
+	vcmpe.f32  	s4,  s0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	s0,  s4
+	MOVCOND		INDEX, Z
+
+.endm
+
+// INIT_S: strided variant of INIT_F.  The load leaves X unchanged and the
+// pointer is then advanced by INC_X, which is added as a byte offset.
+.macro INIT_S
+
+	fldmias	X, { s0 -s1 }
+	vabs.f32   s0,  s0
+	vabs.f32   s1,  s1
+	vadd.f32   s0  , s0,  s1
+	mov	Z, #1
+	mov	INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+
+
+// KERNEL_S1: strided variant of KERNEL_F1; same reduction, compare and
+// conditional update, followed by stepping X forward by INC_X bytes.
+.macro KERNEL_S1
+
+	fldmias	X, { s4 - s5 }
+	add	Z, Z, #1
+	vabs.f32   s4,  s4
+	vabs.f32   s5,  s5
+	vadd.f32   s4  , s4,  s5
+	vcmpe.f32  	s4,  s0
+	vmrs		APSR_nzcv, fpscr
+	VMOVCOND	s0,  s4
+	MOVCOND		INDEX, Z
+	add	X, X, INC_X
+
+.endm
+
+
+
+
+#endif
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* Entry point
+*
+* In:    r0 = N      number of elements
+*        r1 = X      address of the first element
+*        r2 = INC_X  stride between elements, counted in elements
+* Out:   r0 = 1-based index selected by the KERNEL macros,
+*             or 0 when N <= 0 or INC_X <= 0
+* Uses:  r3 (INDEX), r4 (Z, saved and restored on the stack), r12 (I), flags,
+*        and the VFP registers touched by the INIT and KERNEL macros
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+	push    {r4}						// Z lives in r4, restored before returning
+
+#if defined(DOUBLE)
+	vsub.f64                d0 , d0 , d0
+#else
+	vsub.f32                s0 , s0 , s0
+#endif
+	mov	INDEX, #0					// result for the early exits below
+
+	cmp	N, #0
+	ble	iamax_kernel_L999				// nothing to scan
+
+	// A zero or negative stride returns 0, matching the C kernels which
+	// reject inc_x < 1.  Testing only for zero would let a negative stride
+	// walk backwards out of the vector in the strided path.
+	cmp	INC_X, #0
+	ble	iamax_kernel_L999
+
+
+	cmp	INC_X, #1
+	bne	iamax_kernel_S_BEGIN
+
+
+/* Contiguous path (INC_X == 1): loads post-increment X. */
+
+iamax_kernel_F_BEGIN:
+
+	INIT_F							// element 1 becomes the candidate
+
+	subs	N, N , #1					// N = elements still to compare
+	ble	iamax_kernel_L999
+
+	asrs	I, N, #2					// I = N / 4
+	ble	iamax_kernel_F1
+
+	.align 5
+
+/* The loop body is unrolled twice; each half handles four elements and
+   decrements I, so the first half needs its own exit test. */
+
+iamax_kernel_F4:
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+#if defined(COMPLEX) && defined(DOUBLE)
+	pld	[ X, #X_PRE ]
+#endif
+	KERNEL_F1
+	KERNEL_F1
+
+	subs	I, I, #1
+	ble	iamax_kernel_F1
+
+
+#if defined(COMPLEX) || defined(DOUBLE)
+	pld	[ X, #X_PRE ]
+#endif
+	KERNEL_F1
+	KERNEL_F1
+#if defined(COMPLEX) && defined(DOUBLE)
+	pld	[ X, #X_PRE ]
+#endif
+	KERNEL_F1
+	KERNEL_F1
+
+	subs	I, I, #1
+	bne	iamax_kernel_F4
+
+iamax_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4, leftover elements
+	ble	iamax_kernel_L999
+
+iamax_kernel_F10:
+
+	KERNEL_F1
+
+	subs	I, I, #1
+	bne	iamax_kernel_F10
+
+	b	iamax_kernel_L999
+
+/* Strided path: convert INC_X from elements to bytes, then step X manually. */
+
+iamax_kernel_S_BEGIN:
+
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+#endif
+
+#else
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+#endif
+
+#endif
+
+	INIT_S							// element 1 becomes the candidate
+
+	subs	N, N , #1					// N = elements still to compare
+	ble	iamax_kernel_L999
+
+	asrs	I, N, #2					// I = N / 4
+	ble	iamax_kernel_S1
+
+	.align 5
+
+iamax_kernel_S4:
+
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	iamax_kernel_S4
+
+iamax_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4, leftover elements
+	ble	iamax_kernel_L999
+
+iamax_kernel_S10:
+
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	iamax_kernel_S10
+
+
+iamax_kernel_L999:
+
+	mov	r0, INDEX		// set return value
+
+	pop     {r4}
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/iamin.c
+++ b/kernel/arm/iamin.c
@ -0,0 +1,75 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+/*
+ * Index of the first element of x with the smallest absolute value.
+ *
+ *   n      number of elements to examine
+ *   x      input vector
+ *   inc_x  distance between consecutive elements, counted in FLOATs
+ *
+ * Returns the 1-based position of that element, or 0 when n <= 0 or
+ * inc_x < 1.  Only a strictly smaller magnitude replaces the current
+ * candidate, so ties resolve to the lowest index.
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT minf=0.0;
+	BLASLONG min=0;
+
+	/* An empty vector has no element to read, so n == 0 must bail out here. */
+	if (n <= 0 || inc_x < 1 ) return(min);
+
+	minf=ABS(x[0]);
+
+	/* Element 0 is already the candidate; start comparing at element 1. */
+	ix = inc_x;
+	for (i = 1; i < n; i++)
+	{
+		/* minf is already an absolute value, no need to take ABS again. */
+		if( ABS(x[ix]) < minf )
+		{
+			min = i;
+			minf = ABS(x[ix]);
+		}
+		ix += inc_x;
+	}
+	return(min+1);
+}
+	
+
--- a/kernel/arm/imax.c
+++ b/kernel/arm/imax.c
@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+/*
+ * Index of the first element of x with the largest (signed) value.
+ *
+ *   n      number of elements to examine
+ *   x      input vector
+ *   inc_x  distance between consecutive elements, counted in FLOATs
+ *
+ * Returns the 1-based position of that element, or 0 when n <= 0 or
+ * inc_x < 1.  Only a strictly larger value replaces the current
+ * candidate, so ties resolve to the lowest index.
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT maxf=0.0;
+	BLASLONG max=0;
+
+	/* An empty vector has no element to read, so n == 0 must bail out here. */
+	if (n <= 0 || inc_x < 1 ) return(max);
+
+	maxf=x[0];
+
+	/* Element 0 is already the candidate; start comparing at element 1. */
+	ix = inc_x;
+	for (i = 1; i < n; i++)
+	{
+		if( x[ix] > maxf )
+		{
+			max = i;
+			maxf = x[ix];
+		}
+		ix += inc_x;
+	}
+	return(max+1);
+}
+	
+
--- a/kernel/arm/imin.c
+++ b/kernel/arm/imin.c
@ -0,0 +1,65 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/08/19 Saar
+*	 BLASTEST float		
+* 	 BLASTEST double	
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+/*
+ * Index of the first element of x with the smallest (signed) value.
+ *
+ *   n      number of elements to examine
+ *   x      input vector
+ *   inc_x  distance between consecutive elements, counted in FLOATs
+ *
+ * Returns the 1-based position of that element, or 0 when n <= 0 or
+ * inc_x < 1.  Only a strictly smaller value replaces the current
+ * candidate, so ties resolve to the lowest index.
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT minf=0.0;
+	BLASLONG min=0;
+
+	/* An empty vector has no element to read, so n == 0 must bail out here. */
+	if (n <= 0 || inc_x < 1 ) return(min);
+
+	minf=x[0];
+
+	/* Element 0 is already the candidate; start comparing at element 1. */
+	ix = inc_x;
+	for (i = 1; i < n; i++)
+	{
+		/* A minimum search must use "<"; ">" would track the maximum. */
+		if( x[ix] < minf )
+		{
+			min = i;
+			minf = x[ix];
+		}
+		ix += inc_x;
+	}
+	return(min+1);
+}
+	
+
--- a/kernel/arm/izamax.c
+++ b/kernel/arm/izamax.c
@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+/*
+ * Index of the first complex element of x with the largest
+ * |real| + |imaginary| (the CABS1 measure).
+ *
+ *   n      number of complex elements to examine
+ *   x      input vector, stored as interleaved (real, imaginary) pairs
+ *   inc_x  distance between consecutive elements, counted in complex elements
+ *
+ * Returns the 1-based position of that element, or 0 when n <= 0 or
+ * inc_x < 1.  Only a strictly larger measure replaces the current
+ * candidate, so ties resolve to the lowest index.
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT maxf;
+	FLOAT cur;
+	BLASLONG max=0;
+	BLASLONG inc_x2;
+
+	/* An empty vector has no element to read, so n == 0 must bail out here. */
+	if (n <= 0 || inc_x < 1 ) return(max);
+
+	/* Each complex element occupies two FLOATs. */
+	inc_x2 = 2 * inc_x;
+
+	/* Keep the measure of the candidate itself instead of its two parts. */
+	maxf = CABS1(x,0);
+
+	/* Element 0 is already the candidate; start comparing at element 1. */
+	ix = inc_x2;
+	for (i = 1; i < n; i++)
+	{
+		cur = CABS1(x,ix);
+		if( cur > maxf )
+		{
+			max = i;
+			maxf = cur;
+		}
+		ix += inc_x2;
+	}
+	return(max+1);
+}
+	
+
--- a/kernel/arm/izamin.c
+++ b/kernel/arm/izamin.c
@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
+
+/*
+ * Index of the first complex element of x with the smallest
+ * |real| + |imaginary| (the CABS1 measure).
+ *
+ *   n      number of complex elements to examine
+ *   x      input vector, stored as interleaved (real, imaginary) pairs
+ *   inc_x  distance between consecutive elements, counted in complex elements
+ *
+ * Returns the 1-based position of that element, or 0 when n <= 0 or
+ * inc_x < 1.  Only a strictly smaller measure replaces the current
+ * candidate, so ties resolve to the lowest index.
+ */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT minf;
+	FLOAT cur;
+	BLASLONG min=0;
+	BLASLONG inc_x2;
+
+	/* An empty vector has no element to read, so n == 0 must bail out here. */
+	if (n <= 0 || inc_x < 1 ) return(min);
+
+	/* Each complex element occupies two FLOATs. */
+	inc_x2 = 2 * inc_x;
+
+	/* Keep the measure of the candidate itself instead of its two parts. */
+	minf = CABS1(x,0);
+
+	/* Element 0 is already the candidate; start comparing at element 1. */
+	ix = inc_x2;
+	for (i = 1; i < n; i++)
+	{
+		cur = CABS1(x,ix);
+		if( cur < minf )
+		{
+			min = i;
+			minf = cur;
+		}
+		ix += inc_x2;
+	}
+	return(min+1);
+}
+	
+
--- a/kernel/arm/max.c
+++ b/kernel/arm/max.c
@ -0,0 +1,63 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+/*
+ * Largest (signed) value found in x.
+ *
+ *   n      number of elements to examine
+ *   x      input vector
+ *   inc_x  distance between consecutive elements, counted in FLOATs
+ *
+ * Returns that value, or 0.0 when n <= 0 or inc_x < 1.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT maxf=0.0;
+
+	/* An empty vector has no element to read, so n == 0 must bail out here. */
+	if (n <= 0 || inc_x < 1 ) return(maxf);
+
+	maxf=x[0];
+
+	/* Element 0 is already the candidate; start comparing at element 1. */
+	ix = inc_x;
+	for (i = 1; i < n; i++)
+	{
+		if( x[ix] > maxf )
+		{
+			maxf = x[ix];
+		}
+		ix += inc_x;
+	}
+	return(maxf);
+}
+	
+
--- a/kernel/arm/min.c
+++ b/kernel/arm/min.c
@ -0,0 +1,63 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*	 BLASTEST float		: NoTest
+* 	 BLASTEST double	: NoTest
+* 	 CTEST			: NoTest
+* 	 TEST			: NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+/*
+ * Smallest (signed) value found in x.
+ *
+ *   n      number of elements to examine
+ *   x      input vector
+ *   inc_x  distance between consecutive elements, counted in FLOATs
+ *
+ * Returns that value, or 0.0 when n <= 0 or inc_x < 1.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT minf=0.0;
+
+	/* An empty vector has no element to read, so n == 0 must bail out here. */
+	if (n <= 0 || inc_x < 1 ) return(minf);
+
+	minf=x[0];
+
+	/* Element 0 is already the candidate; start comparing at element 1. */
+	ix = inc_x;
+	for (i = 1; i < n; i++)
+	{
+		if( x[ix] < minf )
+		{
+			minf = x[ix];
+		}
+		ix += inc_x;
+	}
+	return(minf);
+}
+	
+
--- a/kernel/arm/nrm2.c
+++ b/kernel/arm/nrm2.c
@ -0,0 +1,88 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/13 Saar
+*	 BLASTEST float		: OK
+* 	 BLASTEST double	: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+
+/*
+ * Euclidean norm of x, accumulated as largest * sqrt(sumsq) where
+ * largest is the biggest magnitude seen so far and sumsq is the sum of
+ * squares of the elements divided by largest.  Zero elements are skipped.
+ *
+ *   n      number of elements
+ *   x      input vector
+ *   inc_x  distance between consecutive elements, counted in FLOATs
+ *
+ * Returns 0.0 when n < 0 or inc_x < 1, and |x[0]| directly when n == 1.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG pos;
+	BLASLONG limit;
+	FLOAT largest = 0.0;
+	FLOAT sumsq   = 1.0;
+	FLOAT mag     = 0.0;
+
+	if ( !(n >= 0 && inc_x >= 1) ) return(0.0);
+	if ( n == 1 ) return( ABS(x[0]) );
+
+	/* One past the last offset that will be visited. */
+	limit = n * inc_x;
+
+	for (pos = 0; pos < limit; pos += inc_x)
+	{
+		if ( x[pos] == 0.0 ) continue;
+
+		mag = ABS( x[pos] );
+		if ( largest < mag )
+		{
+			/* New largest magnitude: rescale the running sum to it. */
+			sumsq = 1 + sumsq * ( largest / mag ) * ( largest / mag );
+			largest = mag ;
+		}
+		else
+		{
+			sumsq += ( mag/largest ) * ( mag/largest );
+		}
+	}
+
+	largest = largest * sqrt( sumsq );
+	return(largest);
+
+}
+	
+
--- a/kernel/arm/nrm2_vfp.S
+++ b/kernel/arm/nrm2_vfp.S
@ -0,0 +1,565 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/22 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+
+#define I	r12
+
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+
+#if	!defined(COMPLEX)
+
+#if	defined(DOUBLE)
+
+
+// KERNEL_F1 (real, double precision): fold the element at X into the scaled
+// sum of squares and step X past it (the load uses writeback).
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter, so the macro may be
+// expanded any number of times.
+.macro KERNEL_F1
+
+	fldmiad	X!, 	{ d4 }
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// a zero element contributes nothing
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, element done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+.endm
+
+// KERNEL_F8 (real, double precision): eight back-to-back KERNEL_F1 expansions
+// for the contiguous path, with a prefetch X_PRE bytes ahead of X before each
+// group of four elements.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (real, double precision): strided counterpart of KERNEL_F1.
+// Folds the element at X into the scaled sum of squares, then advances X by
+// INC_X, which the function body has already converted to a byte stride.
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter like every other kernel
+// macro in this file, so a second expansion cannot produce a duplicate symbol.
+.macro KERNEL_S1
+
+	fldmiad	X, 	{ d4 }
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// a zero element contributes nothing
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, element done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+#else
+
+// KERNEL_F1 (real, single precision): fold the element at X into the scaled
+// sum of squares and step X past it (the load uses writeback).
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter, so the macro may be
+// expanded any number of times.
+.macro KERNEL_F1
+
+	fldmias	X!, 	{ s4 }
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// a zero element contributes nothing
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, element done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+.endm
+
+// KERNEL_F8 (real, single precision): eight back-to-back KERNEL_F1 expansions
+// for the contiguous path, preceded by a single prefetch X_PRE bytes ahead
+// of X.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (real, single precision): strided counterpart of KERNEL_F1.
+// Folds the element at X into the scaled sum of squares, then advances X by
+// INC_X, which the function body has already converted to a byte stride.
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter like every other kernel
+// macro in this file, so a second expansion cannot produce a duplicate symbol.
+.macro KERNEL_S1
+
+	fldmias	X, 	{ s4 }
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// a zero element contributes nothing
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, element done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+
+
+
+#endif
+
+#else
+
+#if	defined(DOUBLE)
+
+// KERNEL_F1 (complex, double precision): load the two values of one element
+// (d4 and d5) with writeback on X and fold each of them in turn into the
+// scaled sum of squares.
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4, d5 and the condition flags are clobbered
+// Both labels carry the per-expansion counter so the macro can be repeated.
+.macro KERNEL_F1
+
+	fldmiad	X!, 	{ d4 - d5 }
+
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+	vcmpe.f64	d5, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_F1_END_\@		// second value is zero: nothing to add
+	vabs.f64   	d5,  d5 
+	vcmpe.f64  	d0,  d5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_END_\@		// scale unchanged, second value done
+	vdiv.f64	d2 , d0, d5			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d5				// scale = x
+
+KERNEL_F1_END_\@:
+
+
+.endm
+
+// KERNEL_F8 (complex, double precision): eight back-to-back KERNEL_F1
+// expansions for the contiguous path, with a prefetch X_PRE bytes ahead of X
+// before every pair of elements.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (complex, double precision): strided counterpart of KERNEL_F1.
+// Loads the two values of the element at X (d4 and d5) without writeback,
+// folds each into the scaled sum of squares, then advances X by INC_X, which
+// the function body has already converted to a byte stride.
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4, d5 and the condition flags are clobbered
+.macro KERNEL_S1
+
+	fldmiad	X, 	{ d4 - d5 }
+
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	vcmpe.f64	d5, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_S1_END_\@		// second value is zero: nothing to add
+	vabs.f64   	d5,  d5 
+	vcmpe.f64  	d0,  d5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_END_\@		// scale unchanged, second value done
+	vdiv.f64	d2 , d0, d5			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d5				// scale = x
+
+KERNEL_S1_END_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+
+#else
+
+// KERNEL_F1 (complex, single precision): load the two values of one element
+// (s4 and s5) with writeback on X and fold each of them in turn into the
+// scaled sum of squares.
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4, s5 and the condition flags are clobbered
+// Both labels carry the per-expansion counter so the macro can be repeated.
+.macro KERNEL_F1
+
+	fldmias	X!, 	{ s4 - s5 }
+
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+	vcmpe.f32	s5, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_F1_END_\@		// second value is zero: nothing to add
+	vabs.f32   	s5,  s5 
+	vcmpe.f32  	s0,  s5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_END_\@		// scale unchanged, second value done
+	vdiv.f32	s2 , s0, s5			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s5				// scale = x
+
+KERNEL_F1_END_\@:
+
+
+.endm
+
+// KERNEL_F8 (complex, single precision): eight back-to-back KERNEL_F1
+// expansions for the contiguous path, with a prefetch X_PRE bytes ahead of X
+// before each group of four elements.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (complex, single precision): strided counterpart of KERNEL_F1.
+// Loads the two values of the element at X (s4 and s5) without writeback,
+// folds each into the scaled sum of squares, then advances X by INC_X, which
+// the function body has already converted to a byte stride.
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4, s5 and the condition flags are clobbered
+.macro KERNEL_S1
+
+	fldmias	X, 	{ s4 - s5 }
+
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	vcmpe.f32	s5, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_S1_END_\@		// second value is zero: nothing to add
+	vabs.f32   	s5,  s5 
+	vcmpe.f32  	s0,  s5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_END_\@		// scale unchanged, second value done
+	vdiv.f32	s2 , s0, s5			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s5				// scale = x
+
+KERNEL_S1_END_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+
+
+#endif
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* nrm2 driver (VFP)
+*
+*   In     : N (r0) element count, X (r1) first element, INC_X (r2) stride in elements
+*   Out    : d0 (DOUBLE) or s0 holds scale * sqrt( ssq )
+*   Uses   : I (r12) as loop counter, d0-d7 / s0-s7, condition flags
+*
+*   N <= 0 or INC_X <= 0 leaves scale at 0.0 and therefore yields 0, the same
+*   answer the C kernel gives for inc_x < 1. INC_X == 1 takes the contiguous
+*   path ( KERNEL_F8 / KERNEL_F1 ), any other positive stride the KERNEL_S1 path.
+**************************************************************************************/
+
+	PROLOGUE
+
+	b nrm2_begin					// step over the literal 1.0 below
+
+
+#if	defined(COMPLEX)
+
+#if	defined(DOUBLE)
+
+znrm2_one:
+	.word	0x00000000
+	.word	0x3ff00000
+
+#else
+
+cnrm2_one:
+	.word	0x3f800000
+
+#endif
+
+#else
+
+#if	defined(DOUBLE)
+
+dnrm2_one:
+	.word	0x00000000
+	.word	0x3ff00000
+
+#else
+
+snrm2_one:
+	.word	0x3f800000
+
+#endif
+
+#endif
+
+
+	.align 5
+
+
+nrm2_begin:
+
+	// The accumulator register holds whatever the caller left in it.
+	// Subtracting it from itself gives NaN when that value is NaN or Inf,
+	// so the zero is built from an integer register instead.
+	mov	I, #0					// all-zero bit pattern == +0.0
+
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+	vmov			d0 , I , I		// scale=0.0
+	vldr.64			d1 , znrm2_one		// ssq=1.0
+	vmov.f64		d7 , d1			// value 1.0 
+	vmov.f64		d6 , d0			// value 0.0 
+#else
+	vmov			s0 , I			// scale=0.0
+	vldr.32			s1 , cnrm2_one		// ssq=1.0
+	vmov.f32		s7 , s1			// value 1.0
+	vmov.f32		s6 , s0			// value 0.0 
+#endif
+
+#else
+
+#if defined(DOUBLE)
+	vmov			d0 , I , I		// scale=0.0
+	vldr.64			d1 , dnrm2_one		// ssq=1.0
+	vmov.f64		d7 , d1			// value 1.0 
+	vmov.f64		d6 , d0			// value 0.0 
+#else
+	vmov			s0 , I			// scale=0.0
+	vldr.32			s1 , snrm2_one		// ssq=1.0
+	vmov.f32		s7 , s1			// value 1.0
+	vmov.f32		s6 , s0			// value 0.0 
+#endif
+
+
+#endif
+
+
+	cmp	N, #0
+	ble	nrm2_kernel_L999
+
+	cmp	INC_X, #0
+	ble	nrm2_kernel_L999			// a negative stride would walk backwards from X
+
+	
+	cmp	INC_X, #1
+	bne	nrm2_kernel_S_BEGIN
+
+
+nrm2_kernel_F_BEGIN:
+
+	asrs	I, N, #3				// I = N / 8
+	beq	nrm2_kernel_F1				// shifts leave V untouched, so test Z only
+
+nrm2_kernel_F8:
+
+	KERNEL_F8
+
+	subs    I, I, #1
+        bne     nrm2_kernel_F8
+
+nrm2_kernel_F1:
+
+	ands    I, N, #7				// I = N % 8
+        beq     nrm2_kernel_L999			// logical ops leave V untouched, so test Z only
+
+
+nrm2_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+        bne     nrm2_kernel_F10
+
+	b	nrm2_kernel_L999
+
+nrm2_kernel_S_BEGIN:
+
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+#endif
+
+#else
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+#endif
+
+#endif
+
+
+
+nrm2_kernel_S1:
+
+	mov	I, N
+
+	.align 5
+
+nrm2_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+        bne     nrm2_kernel_S10
+
+
+nrm2_kernel_L999:
+
+#if defined(DOUBLE)
+	vsqrt.f64	d1, d1
+	vmul.f64	d0, d0, d1				// result = scale * sqrt( ssq )
+#else
+	vsqrt.f32	s1, s1
+	vmul.f32	s0, s0, s1				// result = scale * sqrt( ssq )
+#endif
+
+	bx	lr
+
+	EPILOGUE
+
--- a/kernel/arm/nrm2_vfpv3.S
+++ b/kernel/arm/nrm2_vfpv3.S
@ -0,0 +1,508 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/16 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+
+#define I	r12
+
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+
+#if	!defined(COMPLEX)
+
+#if	defined(DOUBLE)
+
+
+// KERNEL_F1 (real, double precision): fold the element at X into the scaled
+// sum of squares and step X past it (the load uses writeback).
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter, so the macro may be
+// expanded any number of times.
+.macro KERNEL_F1
+
+	fldmiad	X!, 	{ d4 }
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// a zero element contributes nothing
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, element done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+.endm
+
+// KERNEL_F8 (real, double precision): eight back-to-back KERNEL_F1 expansions
+// for the contiguous path, with a prefetch X_PRE bytes ahead of X before each
+// group of four elements.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (real, double precision): strided counterpart of KERNEL_F1.
+// Folds the element at X into the scaled sum of squares, then advances X by
+// INC_X, which the function body has already converted to a byte stride.
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter like every other kernel
+// macro in this file, so a second expansion cannot produce a duplicate symbol.
+.macro KERNEL_S1
+
+	fldmiad	X, 	{ d4 }
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// a zero element contributes nothing
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, element done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+#else
+
+// KERNEL_F1 (real, single precision): fold the element at X into the scaled
+// sum of squares and step X past it (the load uses writeback).
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter, so the macro may be
+// expanded any number of times.
+.macro KERNEL_F1
+
+	fldmias	X!, 	{ s4 }
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// a zero element contributes nothing
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, element done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+.endm
+
+// KERNEL_F8 (real, single precision): eight back-to-back KERNEL_F1 expansions
+// for the contiguous path, preceded by a single prefetch X_PRE bytes ahead
+// of X.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (real, single precision): strided counterpart of KERNEL_F1.
+// Folds the element at X into the scaled sum of squares, then advances X by
+// INC_X, which the function body has already converted to a byte stride.
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4 and the condition flags are clobbered
+// The skip label carries the per-expansion counter like every other kernel
+// macro in this file, so a second expansion cannot produce a duplicate symbol.
+.macro KERNEL_S1
+
+	fldmias	X, 	{ s4 }
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// a zero element contributes nothing
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, element done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+
+
+
+#endif
+
+#else
+
+#if	defined(DOUBLE)
+
+// KERNEL_F1 (complex, double precision): load the two values of one element
+// (d4 and d5) with writeback on X and fold each of them in turn into the
+// scaled sum of squares.
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4, d5 and the condition flags are clobbered
+// Both labels carry the per-expansion counter so the macro can be repeated.
+.macro KERNEL_F1
+
+	fldmiad	X!, 	{ d4 - d5 }
+
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+	vcmpe.f64	d5, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_F1_END_\@		// second value is zero: nothing to add
+	vabs.f64   	d5,  d5 
+	vcmpe.f64  	d0,  d5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_END_\@		// scale unchanged, second value done
+	vdiv.f64	d2 , d0, d5			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d5				// scale = x
+
+KERNEL_F1_END_\@:
+
+
+.endm
+
+// KERNEL_F8 (complex, double precision): eight back-to-back KERNEL_F1
+// expansions for the contiguous path, with a prefetch X_PRE bytes ahead of X
+// before every pair of elements.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (complex, double precision): strided counterpart of KERNEL_F1.
+// Loads the two values of the element at X (d4 and d5) without writeback,
+// folds each into the scaled sum of squares, then advances X by INC_X, which
+// the function body has already converted to a byte stride.
+//   d0      scale, the largest magnitude seen so far (updated)
+//   d1      ssq, the sum of squares relative to scale (updated)
+//   d6, d7  constants 0.0 and 1.0 prepared by the function body
+//   d2, d3, d4, d5 and the condition flags are clobbered
+.macro KERNEL_S1
+
+	fldmiad	X, 	{ d4 - d5 }
+
+	vcmpe.f64	d4, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f64   	d4,  d4 
+	vcmpe.f64  	d0,  d4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f64	d2 , d0, d4			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	vcmpe.f64	d5, d6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_S1_END_\@		// second value is zero: nothing to add
+	vabs.f64   	d5,  d5 
+	vcmpe.f64  	d0,  d5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
+	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_END_\@		// scale unchanged, second value done
+	vdiv.f64	d2 , d0, d5			// scale / x
+	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
+	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f64	d0 , d5				// scale = x
+
+KERNEL_S1_END_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+
+#else
+
+// KERNEL_F1 (complex, single precision): load the two values of one element
+// (s4 and s5) with writeback on X and fold each of them in turn into the
+// scaled sum of squares.
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4, s5 and the condition flags are clobbered
+// Both labels carry the per-expansion counter so the macro can be repeated.
+.macro KERNEL_F1
+
+	fldmias	X!, 	{ s4 - s5 }
+
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_F1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_F1_NEXT_\@:
+
+	vcmpe.f32	s5, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_F1_END_\@		// second value is zero: nothing to add
+	vabs.f32   	s5,  s5 
+	vcmpe.f32  	s0,  s5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_F1_END_\@		// scale unchanged, second value done
+	vdiv.f32	s2 , s0, s5			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s5				// scale = x
+
+KERNEL_F1_END_\@:
+
+
+.endm
+
+// KERNEL_F8 (complex, single precision): eight back-to-back KERNEL_F1
+// expansions for the contiguous path, with a prefetch X_PRE bytes ahead of X
+// before each group of four elements.
+.macro KERNEL_F8
+
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	pld	[ X, #X_PRE ]
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+	KERNEL_F1
+
+.endm
+
+// KERNEL_S1 (complex, single precision): strided counterpart of KERNEL_F1.
+// Loads the two values of the element at X (s4 and s5) without writeback,
+// folds each into the scaled sum of squares, then advances X by INC_X, which
+// the function body has already converted to a byte stride.
+//   s0      scale, the largest magnitude seen so far (updated)
+//   s1      ssq, the sum of squares relative to scale (updated)
+//   s6, s7  constants 0.0 and 1.0 prepared by the function body
+//   s2, s3, s4, s5 and the condition flags are clobbered
+.macro KERNEL_S1
+
+	fldmias	X, 	{ s4 - s5 }
+
+	vcmpe.f32	s4, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr		// copy the FP compare result into the core flags
+	beq		KERNEL_S1_NEXT_\@		// first value is zero: nothing to add
+	vabs.f32   	s4,  s4 
+	vcmpe.f32  	s0,  s4				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_NEXT_\@		// scale unchanged, first value done
+	vdiv.f32	s2 , s0, s4			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s4				// scale = x
+
+KERNEL_S1_NEXT_\@:
+
+	vcmpe.f32	s5, s6				// compare with 0.0
+	vmrs		APSR_nzcv, fpscr
+	beq		KERNEL_S1_END_\@		// second value is zero: nothing to add
+	vabs.f32   	s5,  s5 
+	vcmpe.f32  	s0,  s5				// compare with scale
+	vmrs		APSR_nzcv, fpscr
+	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
+	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
+	bge		KERNEL_S1_END_\@		// scale unchanged, second value done
+	vdiv.f32	s2 , s0, s5			// scale / x
+	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
+	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
+	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
+	vmov.f32	s0 , s5				// scale = x
+
+KERNEL_S1_END_\@:
+
+	add	X, X, INC_X
+
+.endm
+
+
+
+#endif
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/**************************************************************************************
+* nrm2 driver (VFPv3)
+*
+*   In     : N (r0) element count, X (r1) first element, INC_X (r2) stride in elements
+*   Out    : d0 (DOUBLE) or s0 holds scale * sqrt( ssq )
+*   Uses   : I (r12) as loop counter, d0-d7 / s0-s7, condition flags
+*
+*   N <= 0 or INC_X <= 0 leaves scale at 0.0 and therefore yields 0, the same
+*   answer the C kernel gives for inc_x < 1. INC_X == 1 takes the contiguous
+*   path ( KERNEL_F8 / KERNEL_F1 ), any other positive stride the KERNEL_S1 path.
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	// The accumulator register holds whatever the caller left in it.
+	// Subtracting it from itself gives NaN when that value is NaN or Inf,
+	// so the zero is built from an integer register instead.
+	mov	I, #0					// all-zero bit pattern == +0.0
+
+#if defined(DOUBLE)
+	vmov			d0 , I , I		// scale=0.0
+	vmov.f64		d1 , #1.0		// ssq=1.0
+	vmov.f64		d7 , d1			// value 1.0 
+	vmov.f64		d6 , d0			// value 0.0 
+#else
+	vmov			s0 , I			// scale=0.0
+	vmov.f32		s1 , #1.0		// ssq=1.0
+	vmov.f32		s7 , s1			// value 1.0
+	vmov.f32		s6 , s0			// value 0.0 
+#endif
+
+
+
+	cmp	N, #0
+	ble	nrm2_kernel_L999
+
+	cmp	INC_X, #0
+	ble	nrm2_kernel_L999			// a negative stride would walk backwards from X
+
+	
+	cmp	INC_X, #1
+	bne	nrm2_kernel_S_BEGIN
+
+
+nrm2_kernel_F_BEGIN:
+
+	asrs	I, N, #3				// I = N / 8
+	beq	nrm2_kernel_F1				// shifts leave V untouched, so test Z only
+
+nrm2_kernel_F8:
+
+	KERNEL_F8
+
+	subs    I, I, #1
+        bne     nrm2_kernel_F8
+
+nrm2_kernel_F1:
+
+	ands    I, N, #7				// I = N % 8
+        beq     nrm2_kernel_L999			// logical ops leave V untouched, so test Z only
+
+
+nrm2_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+        bne     nrm2_kernel_F10
+
+	b	nrm2_kernel_L999
+
+nrm2_kernel_S_BEGIN:
+
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+#endif
+
+#else
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+#endif
+
+#endif
+
+
+
+nrm2_kernel_S1:
+
+	mov	I, N
+
+	.align 5
+
+nrm2_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+        bne     nrm2_kernel_S10
+
+
+nrm2_kernel_L999:
+
+#if defined(DOUBLE)
+	vsqrt.f64	d1, d1
+	vmul.f64	d0, d0, d1				// result = scale * sqrt( ssq )
+#else
+	vsqrt.f32	s1, s1
+	vmul.f32	s0, s0, s1				// result = scale * sqrt( ssq )
+#endif
+
+	bx	lr
+
+	EPILOGUE
+
--- a/Show More
+++ b/Show More