Merge pull request #323 from wernsaar/develop

Merge bulldozer, haswell, piledriver and armv7 branches
2013-12-03 06:47:03 -08:00 · 2013-12-03 06:47:03 -08:00 · ea74f331f4
parent 5048a80032 53eaf41901
commit ea74f331f4
158 changed files with 75877 additions and 240 deletions
--- a/Makefile.arm
+++ b/Makefile.arm
@ -0,0 +1,12 @@
 # Per-core compiler flags for 32-bit ARM builds.  CCOMMON_OPT and FCOMMON_OPT
 # always receive the same flag set.
 # ARMV7: ARM-state code (-marm), VFPv3 FPU, hard-float ABI, armv7-a ISA.
 ifeq ($(CORE), ARMV7)
 CCOMMON_OPT += -marm -mfpu=vfpv3  -mfloat-abi=hard -march=armv7-a
 FCOMMON_OPT += -marm -mfpu=vfpv3  -mfloat-abi=hard -march=armv7-a
 endif
 # ARMV6: ARM-state code (-marm), baseline VFP FPU, hard-float ABI, armv6 ISA.
 ifeq ($(CORE), ARMV6)
 CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard  -march=armv6
 FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard  -march=armv6
 endif
--- a/Makefile.arm64
+++ b/Makefile.arm64
@ -0,0 +1,7 @@
 # ARMV8 core: build for the armv8-a ISA; the same flag is appended to both
 # CCOMMON_OPT and FCOMMON_OPT.
 ifeq ($(CORE), ARMV8)
 CCOMMON_OPT += -march=armv8-a
 FCOMMON_OPT += -march=armv8-a
 endif
--- a/Makefile.system
+++ b/Makefile.system
@ -336,14 +336,14 @@ ifeq ($(ARCH), x86)
 DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
 	       CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
 endif
 endif
 ifeq ($(ARCH), x86_64)
 DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
 endif
 endif
@ -373,6 +373,19 @@ NO_BINARY_MODE	= 1
 BINARY_DEFINED	= 1
 endif
 # arm and arm64 both set NO_BINARY_MODE and BINARY_DEFINED to 1.
 # NOTE(review): presumably this suppresses the 32/64-bit binary-mode compiler
 # switches used on other architectures -- confirm where these variables are
 # consumed.
 ifeq ($(ARCH), arm)
 NO_BINARY_MODE  = 1
 BINARY_DEFINED  = 1
 endif
 ifeq ($(ARCH), arm64)
 NO_BINARY_MODE  = 1
 BINARY_DEFINED  = 1
 endif
 #
 #  C Compiler dependent settings
 #
@ -833,6 +846,19 @@ ifeq ($(DEBUG), 1)
 COMMON_OPT += -g
 endif
 # Default optimisation level, applied only while COMMON_OPT is still unset:
 # -O3 for arm and arm64, -O2 for every other architecture.
 # Order matters: the generic -O2 fallback must stay last, otherwise the
 # architecture-specific defaults would never be reached.
 # Note that the DEBUG block just above appends -g to COMMON_OPT, so with
 # DEBUG=1 the variable is already non-empty and none of these defaults apply.
 ifndef COMMON_OPT
 ifeq ($(ARCH), arm)
 COMMON_OPT = -O3
 endif
 endif
 ifndef COMMON_OPT
 ifeq ($(ARCH), arm64)
 COMMON_OPT = -O3
 endif
 endif
 ifndef COMMON_OPT
 COMMON_OPT = -O2
 endif
@ -958,6 +984,10 @@ export HAVE_SSE4_2
 export HAVE_SSE4A
 export HAVE_SSE5
 export HAVE_AVX
 export HAVE_VFP
 export HAVE_VFPV3
 export HAVE_VFPV4
 export HAVE_NEON
 export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE
--- a/4
+++ b/4
@ -63,6 +63,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
 $architecture = alpha  if ($data =~ /ARCH_ALPHA/);
 $architecture = sparc  if ($data =~ /ARCH_SPARC/);
 $architecture = ia64   if ($data =~ /ARCH_IA64/);
 # /ARCH_ARM/ also matches the text "ARCH_ARM64", so the arm64 test has to stay
 # after the arm test: the later assignment is the one that wins.
 $architecture = arm    if ($data =~ /ARCH_ARM/);
 $architecture = arm64  if ($data =~ /ARCH_ARM64/);
 $defined = 0;
@ -149,6 +151,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
 $architecture = alpha  if ($data =~ /ARCH_ALPHA/);
 $architecture = sparc  if ($data =~ /ARCH_SPARC/);
 $architecture = ia64   if ($data =~ /ARCH_IA64/);
 # /ARCH_ARM/ also matches the text "ARCH_ARM64", so the arm64 test has to stay
 # after the arm test: the later assignment is the one that wins.
 $architecture = arm    if ($data =~ /ARCH_ARM/);
 $architecture = arm64  if ($data =~ /ARCH_ARM64/);
 $binformat    = bin32;
 $binformat    = bin64  if ($data =~ /BINARY_64/);
--- a/cblas_noconst.h
+++ b/cblas_noconst.h
@ -0,0 +1,303 @@
 #ifndef CBLAS_H
 #define CBLAS_H
 #include <stddef.h>
 #include "common.h"
 #ifdef __cplusplus
 extern "C" {
 	/* Assume C declarations for C++ */
 #endif  /* __cplusplus */
 /* Set the number of threads at runtime. */
 void openblas_set_num_threads(int num_threads);
 void goto_set_num_threads(int num_threads);
 /* Get the build configuration string at runtime. */
 char* openblas_get_config(void);
 /* Get the parallelization type which is used by OpenBLAS */
 int openblas_get_parallel(void); 
 /* OpenBLAS is compiled for sequential use  */
 #define OPENBLAS_SEQUENTIAL  0
 /* OpenBLAS is compiled using normal threading model */
 #define OPENBLAS_THREAD  1 
 /* OpenBLAS is compiled using OpenMP threading model */
 #define OPENBLAS_OPENMP 2 
 #define CBLAS_INDEX size_t
 typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
 typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
 typedef enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
 typedef enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
 typedef enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
 float  cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
 double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
 float  cblas_sdot(blasint n, float  *x, blasint incx, float  *y, blasint incy);
 double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
 openblas_complex_float  cblas_cdotu(blasint n, float  *x, blasint incx, float  *y, blasint incy);
 openblas_complex_float  cblas_cdotc(blasint n, float  *x, blasint incx, float  *y, blasint incy);
 openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
 openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
 void  cblas_cdotu_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret);
 void  cblas_cdotc_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret);
 void  cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
 void  cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
 float  cblas_sasum (blasint n, float  *x, blasint incx);
 double cblas_dasum (blasint n, double *x, blasint incx);
 float  cblas_scasum(blasint n, float  *x, blasint incx);
 double cblas_dzasum(blasint n, double *x, blasint incx);
 float  cblas_snrm2 (blasint N, float  *X, blasint incX);
 double cblas_dnrm2 (blasint N, double *X, blasint incX);
 float  cblas_scnrm2(blasint N, float  *X, blasint incX);
 double cblas_dznrm2(blasint N, double *X, blasint incX);
 CBLAS_INDEX cblas_isamax(blasint n, float  *x, blasint incx);
 CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
 CBLAS_INDEX cblas_icamax(blasint n, float  *x, blasint incx);
 CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
 void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
 void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy);
 void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy);
 void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy);
 void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
 void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
 void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
 void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
 void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
 void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
 void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
 void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
 void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
 void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double  s);
 void cblas_srotg(float *a, float *b, float *c, float *s);
 void cblas_drotg(double *a, double *b, double *c, double *s);
 void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
 void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
 void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
 void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
 void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
 void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
 void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
 void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
 void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
 void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
 void cblas_sgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 		 float alpha, float  *a, blasint lda,  float  *x, blasint incx,  float beta,  float  *y, blasint incy);
 void cblas_dgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 		 double alpha, double  *a, blasint lda,  double  *x, blasint incx,  double beta,  double  *y, blasint incy);
 void cblas_cgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 		 float *alpha, float  *a, blasint lda,  float  *x, blasint incx,  float *beta,  float  *y, blasint incy);
 void cblas_zgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 		 double *alpha, double  *a, blasint lda,  double  *x, blasint incx,  double *beta,  double  *y, blasint incy);
 void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float   alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
 void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double  alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
 void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
 void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
 void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
 void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
 void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
 void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
 void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
 void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
 void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
                blasint incX, float *Y, blasint incY, float *A, blasint lda);
 void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
                blasint incX, double *Y, blasint incY, double *A, blasint lda);
 void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
                float *Y, blasint incY, float *A, blasint lda);
 void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
                double *Y, blasint incY, double *A, blasint lda);
 void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
                 blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
 void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
                 blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
 void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
                 blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
 void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
                 blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
 void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
                 blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
 void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
                 blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
 void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, float *Ap, float *X, blasint incX);
 void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, double *Ap, double *X, blasint incX);
 void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, float *Ap, float *X, blasint incX);
 void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, double *Ap, double *X, blasint incX);
 void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, float *Ap, float *X, blasint incX);
 void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, double *Ap, double *X, blasint incX);
 void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, float *Ap, float *X, blasint incX);
 void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
                 blasint N, double *Ap, double *X, blasint incX);
 void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
                 blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
 void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
                 blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
 void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
                 blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
 void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
                 blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
 void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
                 float *X, blasint incX, float beta, float *Y, blasint incY);
 void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
                 double *X, blasint incX, double beta, double *Y, blasint incY);
 void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
 void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
 void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
 void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
 void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
 void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
 void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
 void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
 void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
 		 float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
 void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
 		 double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
 void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
 		 float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
 void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
 		 double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
 void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 		 float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 		 double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 		 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 		 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
                 float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
                 double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
                 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
                 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		 blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
 void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		 blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
 void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		 blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
 void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		 blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
 void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		  blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		  blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		  blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 		  blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
 void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
 void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
 void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
 void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
 void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
 void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
 void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
 void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
                 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
                 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
                 float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
 void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
                 double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
 void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
                  float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
                  double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 void cblas_xerbla(blasint p, char *rout, char *form, ...);
 #ifdef __cplusplus
 }
 #endif  /* __cplusplus */
 #endif
--- a/common.h
+++ b/common.h
@ -310,6 +310,15 @@ typedef int blasint;
 #define YIELDING	SwitchToThread()
 #endif
 /*
  * YIELDING is the statement executed inside busy-wait loops.
  *
  * Every variant expands to a single expression/statement WITHOUT a trailing
  * semicolon, so that `YIELDING;` behaves the same whichever definition is
  * selected (the sched_yield() fallback never carried one).
  *
  * The asm variants use the __asm__/__volatile__ spellings because the bare
  * `asm` keyword is not available when compiling in strict ISO C modes.
  */
 #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8)
 /* ARM cores: issue eight nops instead of calling into the scheduler. */
 #define YIELDING        __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n")
 #endif
 #ifdef PILEDRIVER
 /* Piledriver: issue eight nops instead of calling into the scheduler. */
 #define YIELDING        __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")
 #endif
 /* Fallback when no earlier definition applied. */
 #ifndef YIELDING
 #define YIELDING	sched_yield()
 #endif
@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_mips64.h"
 #endif
 #ifdef ARCH_ARM
 #include "common_arm.h"
 #endif
 #ifdef ARCH_ARM64
 #include "common_arm64.h"
 #endif
 #ifdef OS_LINUX
 #include "common_linux.h"
 #endif
--- a/common_arm.h
+++ b/common_arm.h
@ -0,0 +1,169 @@
 /*****************************************************************************
 Copyright (c) 2011, Lab of Parallel Software and Computational Science, ISCAS
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the ISCAS nor the names of its contributors may 
      be used to endorse or promote products derived from this software 
      without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 #ifndef COMMON_ARM
 #define COMMON_ARM
 /* MB and WMB expand to nothing in this header.
    NOTE(review): the names suggest memory / write-memory barriers -- confirm
    that empty definitions are intended for multi-core ARM targets. */
 #define MB
 #define WMB
 /* INLINE maps to the C `inline` keyword. */
 #define INLINE inline
 /* NOTE(review): presumably signals to generic code that complex results are
    returned by value on this architecture -- confirm where it is tested. */
 #define RETURN_BY_COMPLEX
 #ifndef ASSEMBLER
 /*
  * Spin until the lock word at `address` has been acquired by this thread.
  * A zero word means "free"; a non-zero word means "held".
  *
  * Each round first waits (executing YIELDING) while the word reads non-zero,
  * then attempts an exclusive load/store pair:
  *   ldrex  reads the current value of the lock word into r2,
  *   strex  tries to store 1 and sets `ret` to 0 on success, 1 on failure.
  * The value read by ldrex is OR-ed into `ret`, so the round is repeated both
  * when the exclusive store failed and when another thread took the lock
  * between the plain read and the ldrex.
  *
  * The previous version stored 0 and ignored the value returned by ldrex, so
  * the lock word was never set and the lock excluded nobody.
  *
  * `ret` is an early-clobber output because strex does not allow its status
  * register to overlap the value or address registers.
  *
  * NOTE(review): no hardware barrier instruction is issued here; only the
  * "memory" clobber constrains the compiler.  Confirm whether a DMB (or the
  * ARMv6 CP15 equivalent) is required on the SMP targets this is built for.
  */
 static void __inline blas_lock(volatile BLASULONG *address){
  int register ret;
  do {
    /* Cheap read-only wait before trying the exclusive access. */
    while (*address) {YIELDING;};
    __asm__ __volatile__(
                         "ldrex r2, [%1]                                                \n\t"
                         "strex %0, %2, [%1]                                            \n\t"
                         "orr   %0, %0, r2                                              \n\t"
                         : "=&r"(ret)
                         : "r"(address), "r"(1)
                         : "memory", "r2"
    );
  } while (ret);
 }
 /*
  * Return the current wall-clock time, as reported by gettimeofday(), in
  * milliseconds (seconds * 1000, fraction truncated).
  *
  * The constant is written as plain `1000.0`: the former `1000.0d` relied on
  * a GNU-only `d` suffix that is not ISO C and is rejected by other compilers.
  *
  * NOTE(review): presumably this stands in for a hardware cycle counter on
  * ARM -- confirm that callers only use differences between two readings.
  */
 static inline unsigned long long rpcc(void){
  struct timeval tv;
  double v;
  gettimeofday(&tv,NULL);
  /* Seconds since the epoch with the microsecond part as a fraction. */
  v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
  return (unsigned long long) ( v * 1000.0 );
 }
 /* Integer quotient of x by y using the ordinary C division operator; the
    result is converted to int on return.  y is not checked for zero. */
 static inline int blas_quickdivide(blasint x, blasint y){
  const blasint quotient = x / y;
  return quotient;
 }
 /* GET_IMAGE(res) stores one VFP register into `res`: d1 (64-bit) when DOUBLE
    is defined, otherwise s1 (32-bit).
    NOTE(review): presumably this fetches the imaginary part of a complex
    result left in registers by an assembly kernel -- confirm against the
    callers and the kernels' return convention. */
 #if defined(DOUBLE)
 #define GET_IMAGE(res)  __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
 #else
 #define GET_IMAGE(res)  __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
 #endif
 /* GET_IMAGE_CANCEL expands to nothing on this architecture. */
 #define GET_IMAGE_CANCEL
 #endif
 /* REALNAME is the symbol name used for the routine being built: ASMNAME by
    default, ASMFNAME when F_INTERFACE is defined (both defined elsewhere). */
 #ifndef F_INTERFACE
 #define REALNAME ASMNAME
 #else
 #define REALNAME ASMFNAME
 #endif
 /* Assembler-only helpers (skipped when NEEDPARAM is defined).
    PROLOGUE selects ARM-state encoding (.arm), exports REALNAME (.global),
    opens a .func record for it and emits the REALNAME label.
    EPILOGUE and PROFCODE expand to nothing, so no matching .endfunc is
    emitted by these macros. */
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 #define PROLOGUE \
 	.arm		 ;\
 	.global	REALNAME ;\
 	.func	REALNAME  ;\
REALNAME:
 #define EPILOGUE 
 #define PROFCODE
 #endif
 /* Memory-layout parameters for this architecture. */
 /* NOTE(review): SEEK_ADDRESS is only a flag here; its effect is decided by
    code outside this header -- confirm where it is tested. */
 #define SEEK_ADDRESS
 /* 4 KiB, unless PAGESIZE was already defined by the includer. */
 #ifndef PAGESIZE
 #define PAGESIZE        ( 4 << 10)
 #endif
 /* 4 MiB */
 #define HUGE_PAGESIZE   ( 4 << 20)
 /* 16 MiB */
 #define BUFFER_SIZE     (16 << 20)
 /* One BUFFER_SIZE slot per CPU below START_ADDRESS; START_ADDRESS and
    MAX_CPU_NUMBER are defined elsewhere. */
 #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
 /* Fall back to the MAP_ANON spelling where MAP_ANONYMOUS is not provided. */
 #ifndef MAP_ANONYMOUS
 #define MAP_ANONYMOUS MAP_ANON
 #endif
 #endif
--- a/common_arm64.h
+++ b/common_arm64.h
@ -0,0 +1,169 @@
 /*****************************************************************************
 Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the ISCAS nor the names of its contributors may 
      be used to endorse or promote products derived from this software 
      without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 /* Include guard for the ARM64-specific common definitions. */
 #ifndef COMMON_ARM64
 #define COMMON_ARM64
 /* MB and WMB expand to nothing here, so using them emits no barrier instruction. */
 #define MB
 #define WMB
 /* INLINE maps to the plain C inline keyword. */
 #define INLINE inline
 /* Defined without a value; presumably tested with #ifdef elsewhere -- TODO confirm where it is read. */
 #define RETURN_BY_COMPLEX
 #ifndef ASSEMBLER
 static void __inline blas_lock(volatile BLASULONG *address){
 /*
  int register ret;
  do {
    while (*address) {YIELDING;};
    __asm__ __volatile__(
                         "ldrex r2, [%1]                                                \n\t"
                         "mov   r2, #0                                                  \n\t"
                         "strex r3, r2, [%1]                                            \n\t"
 			 "mov	%0 , r3							\n\t"
                         : "=r"(ret), "=r"(address)
                         : "1"(address)
                         : "memory", "r2" , "r3" 
    );
  } while (ret);
 */
 }
 /*
  * Return the current wall-clock time in milliseconds.
  *
  * The time comes from gettimeofday(): seconds and microseconds are combined
  * into a double, scaled by 1000 and truncated to an unsigned 64-bit value.
  * The return value of gettimeofday() is not checked.
  */
 static inline unsigned long long rpcc(void){
  unsigned long long ret=0;
  double v;
  struct timeval tv;
  gettimeofday(&tv,NULL);
  v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
  /* 1000.0 is already a double; the former "d" suffix is a GNU extension
     that other compilers reject. */
  ret = (unsigned long long) ( v * 1000.0 );
  return ret;
 }
 /* Quotient of x by y using ordinary C integer division; the result is returned as int. */
 static inline int blas_quickdivide(blasint x, blasint y){
  blasint quotient = x / y;
  return quotient;
 }
 /*
  * GET_IMAGE(res): store register d1 (when DOUBLE is defined) or s1
  * (otherwise) into the lvalue res using a vstr instruction.
  * NOTE(review): vstr.f64/vstr.f32 is AArch32 VFP syntax -- confirm that it
  * assembles for the AArch64 targets this header is meant for.
  */
 #if defined(DOUBLE)
 #define GET_IMAGE(res)  __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
 #else
 #define GET_IMAGE(res)  __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
 #endif
 /* GET_IMAGE_CANCEL expands to nothing on this target. */
 #define GET_IMAGE_CANCEL
 #endif
 /* REALNAME is ASMNAME by default and ASMFNAME when F_INTERFACE is defined. */
 #ifndef F_INTERFACE
 #define REALNAME ASMNAME
 #else
 #define REALNAME ASMFNAME
 #endif
 /*
  * Assembler-only helpers.  PROLOGUE emits .arm, exports REALNAME, opens a
  * .func record and places the REALNAME label.  EPILOGUE and PROFCODE are
  * empty, so these macros emit no matching .endfunc.
  * NOTE(review): .arm selects the 32-bit ARM instruction set in GNU as --
  * confirm the directive is accepted when assembling for AArch64.
  */
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 #define PROLOGUE \
 	.arm		 ;\
 	.global	REALNAME ;\
 	.func	REALNAME  ;\
 REALNAME:
 #define EPILOGUE 
 #define PROFCODE
 #endif
 /* Defined without a value; presumably tested with #ifdef elsewhere -- TODO confirm. */
 #define SEEK_ADDRESS
 /* Default page size of 4 KiB unless PAGESIZE was already defined. */
 #ifndef PAGESIZE
 #define PAGESIZE        ( 4 << 10)
 #endif
 /* 4 MiB and 16 MiB respectively. */
 #define HUGE_PAGESIZE   ( 4 << 20)
 #define BUFFER_SIZE     (16 << 20)
 /* START_ADDRESS and MAX_CPU_NUMBER are not defined in this header; they must come from elsewhere. */
 #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
 /* Fall back to MAP_ANON when MAP_ANONYMOUS is not defined. */
 #ifndef MAP_ANONYMOUS
 #define MAP_ANONYMOUS MAP_ANON
 #endif
 /* Closes the COMMON_ARM64 include guard. */
 #endif
--- a/cpuid.h
+++ b/cpuid.h
@ -107,7 +107,7 @@
 #define CORE_BOBCAT     21
 #define CORE_BULLDOZER  22
 #define CORE_PILEDRIVER  23
-#define CORE_HASWELL CORE_SANDYBRIDGE
+#define CORE_HASWELL 24
 #define HAVE_SSE      (1 <<  0)
 #define HAVE_SSE2     (1 <<  1)
@ -200,7 +200,6 @@ typedef struct {
 #define CPUTYPE_BOBCAT                  45
 #define CPUTYPE_BULLDOZER               46
 #define CPUTYPE_PILEDRIVER              47
-// this define is because BLAS doesn't have haswell specific optimizations yet
+#define CPUTYPE_HASWELL 48
 #define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE 
 #endif
--- a/cpuid_arm.c
+++ b/cpuid_arm.c
@ -0,0 +1,262 @@
 /**************************************************************************
  Copyright (c) 2013, The OpenBLAS Project
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
  3. Neither the name of the OpenBLAS project nor the names of
  its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
 #include <string.h>
 /* CPU identifiers returned by detect(); each value is also an index into cpuname[]. */
 #define CPU_UNKNOWN     	0
 #define CPU_ARMV6       	1
 #define CPU_ARMV7       	2
 #define CPU_CORTEXA15       	3
 /*
  * Printable core names, indexed by the CPU_* values above.
  * Entry 0 used to be misspelled "UNKOWN"; it now matches the "UNKNOWN"
  * printed by get_subarchitecture().
  */
 static char *cpuname[] = {
  "UNKNOWN",
  "ARMV6",
  "ARMV7",
  "CORTEXA15"
 };
 /*
  * Report whether the "Features" line of /proc/cpuinfo lists the given word.
  *
  * search: feature name to look for, compared with strcmp (exact match).
  * Returns 1 when the feature is listed and 0 otherwise, including when
  * search is NULL, the file cannot be opened, no "Features" line exists or
  * the code is built without the linux macro.
  *
  * Fixes over the previous version: fopen failure is handled, the bare
  * "return;" in this int function now returns 0, the first token of the list
  * is no longer skipped, and the trailing newline no longer sticks to the
  * last token.
  */
 int get_feature(char *search)
 {
 #ifdef linux
 	FILE *infile;
 	char buffer[2048], *p, *t;
 	if (search == NULL) return(0);
 	p = (char *) NULL;
 	infile = fopen("/proc/cpuinfo", "r");
 	if (infile == NULL) return(0);
 	while (fgets(buffer, sizeof(buffer), infile))
 	{
 		if (!strncmp("Features", buffer, 8))
 		{
 			/* Step past the colon; leading blanks are dropped by strtok below. */
 			p = strchr(buffer, ':');
 			if (p != NULL) p++;
 			break;
 		}
 	}
 	fclose(infile);
 	if (p == NULL) return(0);
 	/* Newline is a delimiter so the last feature on the line compares cleanly. */
 	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
 	{
 		if (!strcmp(t, search)) { return(1); }
 	}
 #endif
 	return(0);
 }
 /*
  * Classify the host CPU from the "model name" line of /proc/cpuinfo.
  *
  * Returns CPU_ARMV7 when the model name contains "ARMv7" and the feature
  * list has vfpv4 or vfpv3, CPU_ARMV6 when an ARMv7 or ARMv6 model only
  * offers vfp, and CPU_UNKNOWN in every other case (including when the file
  * cannot be opened or the code is built without the linux macro).
  *
  * Fixes over the previous version: fopen failure and a "model name" line
  * without a colon no longer lead to a NULL/invalid pointer being used.
  */
 int detect(void)
 {
 #ifdef linux
 	FILE *infile;
 	char buffer[512], *p;
 	p = (char *) NULL;
 	infile = fopen("/proc/cpuinfo", "r");
 	if (infile == NULL) return CPU_UNKNOWN;
 	while (fgets(buffer, sizeof(buffer), infile))
 	{
 		if (!strncmp("model name", buffer, 10))
 		{
 			/* Only a substring search follows, so stepping past the colon is enough. */
 			p = strchr(buffer, ':');
 			if (p != NULL) p++;
 			break;
 		}
 	}
 	fclose(infile);
 	if (p != NULL)
 	{
 		if (strstr(p, "ARMv7"))
 		{
 			if ( get_feature("vfpv4"))
 				return CPU_ARMV7;
 			if ( get_feature("vfpv3"))
 				return CPU_ARMV7;
 			/* An ARMv7 core with plain VFP only is handled as ARMV6. */
 			if ( get_feature("vfp"))
 				return CPU_ARMV6;
 		}
 		if (strstr(p, "ARMv6"))
 		{
 			if ( get_feature("vfp"))
 				return CPU_ARMV6;
 		}
 	}
 #endif
 	return CPU_UNKNOWN;
 }
 /* Return the cpuname[] entry for the CPU reported by detect(). */
 char *get_corename(void)
 {
 	int cpu = detect();
 	return cpuname[cpu];
 }
 /* Write the architecture name "ARM" to stdout, without a trailing newline. */
 void get_architecture(void)
 {
 	fputs("ARM", stdout);
 }
 /*
  * Write the sub-architecture name to stdout, without a trailing newline:
  * "ARMV7" or "ARMV6" for the matching detect() result, "UNKNOWN" otherwise.
  */
 void get_subarchitecture(void)
 {
 	const char *label = "UNKNOWN";
 	int cpu = detect();
 	if (cpu == CPU_ARMV7)
 		label = "ARMV7";
 	else if (cpu == CPU_ARMV6)
 		label = "ARMV6";
 	printf("%s", label);
 }
 /* Write the sub-directory name "arm" to stdout, without a trailing newline. */
 void get_subdirname(void)
 {
 	fputs("arm", stdout);
 }
 /*
  * Print the configuration #define lines for the detected CPU to stdout.
  *
  * ARMV7 gets ARMV7, HAVE_VFP and HAVE_VFPV3, plus HAVE_NEON / HAVE_VFPV4 when
  * get_feature() reports them; ARMV6 gets ARMV6 and HAVE_VFP.  Both are
  * followed by the same fixed cache/TLB parameters.  Any other detection
  * result prints nothing.
  */
 void get_cpuconfig(void)
 {
 	int cpu = detect();
 	if (cpu != CPU_ARMV7 && cpu != CPU_ARMV6)
 		return;
 	if (cpu == CPU_ARMV7)
 	{
 		printf("#define ARMV7\n");
 		printf("#define HAVE_VFP\n");
 		printf("#define HAVE_VFPV3\n");
 		if (get_feature("neon"))
 			printf("#define HAVE_NEON\n");
 		if (get_feature("vfpv4"))
 			printf("#define HAVE_VFPV4\n");
 	}
 	else
 	{
 		printf("#define ARMV6\n");
 		printf("#define HAVE_VFP\n");
 	}
 	/* Cache and TLB parameters are identical for both cores. */
 	printf("#define L1_DATA_SIZE 65536\n");
 	printf("#define L1_DATA_LINESIZE 32\n");
 	/* NOTE(review): 512488 is not a power of two (512 KiB is 524288) -- confirm the intended value. */
 	printf("#define L2_SIZE 512488\n");
 	printf("#define L2_LINESIZE 32\n");
 	printf("#define DTB_DEFAULT_ENTRIES 64\n");
 	printf("#define DTB_SIZE 4096\n");
 	printf("#define L2_ASSOCIATIVE 4\n");
 }
 /*
  * Print the library name for the detected CPU followed by a newline:
  * "armv7" or "armv6".  Any other detection result prints nothing.
  */
 void get_libname(void)
 {
 	int cpu = detect();
 	if (cpu == CPU_ARMV7)
 		printf("armv7\n");
 	else if (cpu == CPU_ARMV6)
 		printf("armv6\n");
 }
 /*
  * Print one "HAVE_<X>=1" line to stdout for each of vfp, vfpv3, vfpv4 and
  * neon that appears on the "Features" line of /proc/cpuinfo.
  *
  * Nothing is printed when the file cannot be opened, no "Features" line
  * exists or the code is built without the linux macro.
  *
  * Fixes over the previous version: fopen failure is handled, the first
  * token of the list is no longer skipped, and the trailing newline no longer
  * sticks to the last token (which made a final feature fail to match).
  */
 void get_features(void)
 {
 #ifdef linux
 	FILE *infile;
 	char buffer[2048], *p, *t;
 	p = (char *) NULL;
 	infile = fopen("/proc/cpuinfo", "r");
 	if (infile == NULL) return;
 	while (fgets(buffer, sizeof(buffer), infile))
 	{
 		if (!strncmp("Features", buffer, 8))
 		{
 			/* Step past the colon; leading blanks are dropped by strtok below. */
 			p = strchr(buffer, ':');
 			if (p != NULL) p++;
 			break;
 		}
 	}
 	fclose(infile);
 	if( p == NULL ) return;
 	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
 	{
 		if (!strcmp(t, "vfp"))   { printf("HAVE_VFP=1\n"); continue; }
 		if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; }
 		if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; }
 		if (!strcmp(t, "neon"))  { printf("HAVE_NEON=1\n"); continue; }
 	}
 #endif
 	return;
 }
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@ -1243,6 +1243,7 @@ static char *cpuname[] = {
  "BOBCAT",
  "BULLDOZER",
  "PILEDRIVER",
  "HASWELL",
 };
 static char *lowercpuname[] = {
@ -1293,6 +1294,7 @@ static char *lowercpuname[] = {
  "bobcat",
  "bulldozer",
  "piledriver",
  "haswell",
 };
 static char *corename[] = {
@ -1320,6 +1322,7 @@ static char *corename[] = {
  "BOBCAT",
  "BULLDOZER",
  "PILEDRIVER",
  "HASWELL",
 };
 static char *corename_lower[] = {
@ -1347,6 +1350,7 @@ static char *corename_lower[] = {
  "bobcat",
  "bulldozer",
  "piledriver",
  "haswell",
 };
--- a/ctest.c
+++ b/ctest.c
@ -124,3 +124,12 @@ ARCH_IA64
 #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__)
 BINARY_64
 #endif
 /* Emit the 32-bit ARM marker token when the compiler predefines either of these ARM macros. */
 #if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__)
 ARCH_ARM
 #endif
 /* Emit the 64-bit ARM marker token when the compiler predefines the AArch64 macro. */
 #if defined(__aarch64__)
 ARCH_ARM64
 #endif
--- a/driver/level3/level3.c
+++ b/driver/level3/level3.c
@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
+#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
        if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
        else
                if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
                else
                        if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
      for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
 	min_jj = MIN(n_to, xxx + div_n) - jjs;
-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
+#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
 	if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
 	else
 		if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
 		else
 			if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@ -65,14 +65,15 @@ extern gotoblas_t  gotoblas_BOBCAT;
 extern gotoblas_t  gotoblas_SANDYBRIDGE;
 extern gotoblas_t  gotoblas_BULLDOZER;
 extern gotoblas_t  gotoblas_PILEDRIVER;
 extern gotoblas_t  gotoblas_HASWELL;
 #else
 //Use NEHALEM kernels for sandy bridge
 #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
 #define gotoblas_HASWELL gotoblas_NEHALEM
 #define gotoblas_BULLDOZER gotoblas_BARCELONA
 #define gotoblas_PILEDRIVER gotoblas_BARCELONA
 #endif
-//Use sandy bridge kernels for haswell.
+
 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE
 #define VENDOR_INTEL      1
 #define VENDOR_AMD        2
@ -297,6 +298,7 @@ static char *corename[] = {
    "Bobcat",
    "Bulldozer",
    "Piledriver",
    "Haswell",
 };
 char *gotoblas_corename(void) {
@ -319,7 +321,8 @@ char *gotoblas_corename(void) {
  if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
  if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
  if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
-  if (gotoblas == &gotoblas_PILEDRIVER)    return corename[19];
+  if (gotoblas == &gotoblas_PILEDRIVER)   return corename[19];
  if (gotoblas == &gotoblas_HASWELL)      return corename[20];
  return corename[0];
 }
--- a/getarch.c
+++ b/getarch.c
@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "SANDYBRIDGE"
 #endif
 /*
  * FORCE_HASWELL: hard-coded target description for Haswell.  Defining FORCE
  * makes main() print CORENAME instead of calling get_corename().
  */
 #ifdef FORCE_HASWELL
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"
 #define SUBARCHITECTURE "HASWELL"
 #define ARCHCONFIG   "-DHASWELL " \
 		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
 		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
                     "-DFMA3"
 #define LIBNAME   "haswell"
 #define CORENAME  "HASWELL"
 #endif
 #ifdef FORCE_ATOM
 #define FORCE
 #define FORCE_INTEL
@ -679,6 +694,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "generic"
 #endif
 /*
  * FORCE_ARMV7: hard-coded target description for ARMv7 (VFP and VFPv3
  * enabled).  The #else branch is empty and has no effect.
  * NOTE(review): L2_SIZE=512488 is not a power of two (512 KiB is 524288) --
  * confirm the intended value.
  */
 #ifdef FORCE_ARMV7
 #define FORCE
 #define ARCHITECTURE    "ARM"
 #define SUBARCHITECTURE "ARMV7"
 #define SUBDIRNAME      "arm"
 #define ARCHCONFIG   "-DARMV7 " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
       "-DHAVE_VFPV3 -DHAVE_VFP"
 #define LIBNAME   "armv7"
 #define CORENAME  "ARMV7"
 #else
 #endif
 /*
  * FORCE_ARMV6: hard-coded target description for ARMv6 (VFP enabled).
  * The #else branch is empty and has no effect.
  * NOTE(review): L2_SIZE=512488 is not a power of two (512 KiB is 524288) --
  * confirm the intended value.
  */
 #ifdef FORCE_ARMV6
 #define FORCE
 #define ARCHITECTURE    "ARM"
 #define SUBARCHITECTURE "ARMV6"
 #define SUBDIRNAME      "arm"
 #define ARCHCONFIG   "-DARMV6 " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
       "-DHAVE_VFP"
 #define LIBNAME   "armv6"
 #define CORENAME  "ARMV6"
 #else
 #endif
 /*
  * FORCE_ARMV8: hard-coded target description for ARMv8 on the ARM64
  * architecture (VFP, VFPv3 and VFPv4 enabled).  The #else branch is empty
  * and has no effect.
  * NOTE(review): L2_SIZE=512488 is not a power of two (512 KiB is 524288) --
  * confirm the intended value.
  */
 #ifdef FORCE_ARMV8
 #define FORCE
 #define ARCHITECTURE    "ARM64"
 #define SUBARCHITECTURE "ARMV8"
 #define SUBDIRNAME      "arm64"
 #define ARCHCONFIG   "-DARMV8 " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
       "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
 #define LIBNAME   "armv8"
 #define CORENAME  "ARMV8"
 #else
 #endif
 #ifndef FORCE
 #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
@ -719,6 +780,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 /* On 32-bit ARM compilers, pull in the ARM CPU detection code and mark the target as supported. */
 #ifdef __arm__
 #include "cpuid_arm.c"
 #define OPENBLAS_SUPPORTED
 #endif
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
 #endif
@ -773,7 +840,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
    printf("CORE=%s\n", CORENAME);
 #else    
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
    printf("CORE=%s\n", get_corename());
 #endif
 #endif
@ -788,6 +855,11 @@ int main(int argc, char *argv[]){
    printf("NUM_CORES=%d\n", get_num_cores());
 #if defined(__arm__) && !defined(FORCE)
        get_features();
 #endif
 #if defined(__i386__) || defined(__x86_64__)
 #ifndef FORCE
    get_sse();
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@ -14,6 +14,20 @@ ifeq ($(ARCH), MIPS)
 USE_GEMM3M = 1
 endif
 # ARM and ARM64 builds set USE_TRMM, which the "ifdef USE_TRMM" section
 # further down uses to build the trmm_kernel objects from the dedicated
 # xTRMMKERNEL sources.
 ifeq ($(ARCH), arm)
 USE_TRMM = 1
 endif
 ifeq ($(ARCH), arm64)
 USE_TRMM = 1
 endif
 ifeq ($(TARGET), LOONGSON3B)												 
 USE_TRMM = 1
 endif
 SKERNELOBJS	+= \
 	sgemm_kernel$(TSUFFIX).$(SUFFIX) \
 	$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
@ -498,7 +512,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
 $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
 	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
-ifeq ($(TARGET), LOONGSON3B)												 
+
 ifdef USE_TRMM											 
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -582,24 +597,6 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 else
 ifdef STRMMKERNEL
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
 $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
 $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 else
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -613,79 +610,17 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 endif
 ifdef DTRMMKERNEL
 ifdef DTRMMKERNEL_LN
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
 endif
 ifdef DTRMMKERNEL_LT
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
 endif
 ifdef DTRMMKERNEL_RN
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 endif
 ifdef DTRMMKERNEL_RT
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 endif
 else
 ifdef DTRMMKERNEL_LN
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
 endif
 ifdef DTRMMKERNEL_LT
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
 endif
 ifdef DTRMMKERNEL_RN
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 endif
 ifdef DTRMMKERNEL_RT
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 else
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 endif
 endif
 ifdef QTRMMKERNEL
 $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -699,50 +634,6 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 else
 $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
 $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
 $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 endif
 ifdef CTRMMKERNEL
 $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
 $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
 $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
 $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 else
 $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@ -767,37 +658,6 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
 $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 endif
 ifdef ZTRMMKERNEL
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
 $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
 $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 else
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@ -821,37 +681,10 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 endif
 endif
 ifdef XTRMMKERNEL
 $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
 $(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
 $(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
 $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
 $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 else
 $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@ -877,9 +710,6 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
 $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
 endif
 $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
--- a/kernel/arm/KERNEL
+++ b/kernel/arm/KERNEL
@ -0,0 +1,46 @@
 # Fallback kernel sources for ARM.  Each variable is assigned only when it
 # has not been defined already.
 # Euclidean norm: C implementations local to this directory.
 ifndef SNRM2KERNEL
 SNRM2KERNEL = nrm2.c
 endif
 ifndef DNRM2KERNEL
 DNRM2KERNEL = nrm2.c
 endif
 ifndef CNRM2KERNEL
 CNRM2KERNEL = znrm2.c
 endif
 ifndef ZNRM2KERNEL
 ZNRM2KERNEL = znrm2.c
 endif
 # cabs and lsame: shared implementations from ../generic.
 ifndef SCABS_KERNEL
 SCABS_KERNEL	= ../generic/cabs.c
 endif
 ifndef DCABS_KERNEL
 DCABS_KERNEL	= ../generic/cabs.c
 endif
 ifndef QCABS_KERNEL
 QCABS_KERNEL	= ../generic/cabs.c
 endif
 ifndef LSAME_KERNEL
 LSAME_KERNEL	= ../generic/lsame.c
 endif
 # GEMM beta scaling: shared real and complex implementations from ../generic.
 ifndef SGEMM_BETA
 SGEMM_BETA = ../generic/gemm_beta.c
 endif
 ifndef DGEMM_BETA
 DGEMM_BETA = ../generic/gemm_beta.c
 endif
 ifndef CGEMM_BETA
 CGEMM_BETA = ../generic/zgemm_beta.c
 endif
 ifndef ZGEMM_BETA
 ZGEMM_BETA = ../generic/zgemm_beta.c
 endif
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@ -0,0 +1,142 @@
 # Kernel source selection for the ARMV6 core (VFP assembly where available,
 # generic C otherwise). Names are relative to the kernel directory.

 # amax / amin / max / min and their index-returning (I*) variants all map to
 # the same source file, iamax_vfp.S.
 # NOTE(review): presumably the variant is chosen by preprocessor defines
 # supplied by the build rules - confirm in the Level-1 kernel Makefile.
 SAMAXKERNEL  = iamax_vfp.S
 DAMAXKERNEL  = iamax_vfp.S
 CAMAXKERNEL  = iamax_vfp.S
 ZAMAXKERNEL  = iamax_vfp.S
 SAMINKERNEL  = iamax_vfp.S
 DAMINKERNEL  = iamax_vfp.S
 CAMINKERNEL  = iamax_vfp.S
 ZAMINKERNEL  = iamax_vfp.S
 SMAXKERNEL   = iamax_vfp.S
 DMAXKERNEL   = iamax_vfp.S
 SMINKERNEL   = iamax_vfp.S
 DMINKERNEL   = iamax_vfp.S
 ISAMAXKERNEL = iamax_vfp.S
 IDAMAXKERNEL = iamax_vfp.S
 ICAMAXKERNEL = iamax_vfp.S
 IZAMAXKERNEL = iamax_vfp.S
 ISAMINKERNEL = iamax_vfp.S
 IDAMINKERNEL = iamax_vfp.S
 ICAMINKERNEL = iamax_vfp.S
 IZAMINKERNEL = iamax_vfp.S
 ISMAXKERNEL  = iamax_vfp.S
 IDMAXKERNEL  = iamax_vfp.S
 ISMINKERNEL  = iamax_vfp.S
 IDMINKERNEL  = iamax_vfp.S
 # Level-1 kernels sharing one source across all four precisions.
 SASUMKERNEL  = asum_vfp.S
 DASUMKERNEL  = asum_vfp.S
 CASUMKERNEL  = asum_vfp.S
 ZASUMKERNEL  = asum_vfp.S
 SAXPYKERNEL  = axpy_vfp.S
 DAXPYKERNEL  = axpy_vfp.S
 CAXPYKERNEL  = axpy_vfp.S
 ZAXPYKERNEL  = axpy_vfp.S
 # copy and dot have one dedicated source per precision.
 SCOPYKERNEL  = scopy_vfp.S
 DCOPYKERNEL  = dcopy_vfp.S
 CCOPYKERNEL  = ccopy_vfp.S
 ZCOPYKERNEL  = zcopy_vfp.S
 SDOTKERNEL   = sdot_vfp.S
 DDOTKERNEL   = ddot_vfp.S
 CDOTKERNEL   = cdot_vfp.S
 ZDOTKERNEL   = zdot_vfp.S
 SNRM2KERNEL  = nrm2_vfp.S
 DNRM2KERNEL  = nrm2_vfp.S
 CNRM2KERNEL  = nrm2_vfp.S
 ZNRM2KERNEL  = nrm2_vfp.S
 SROTKERNEL   = rot_vfp.S
 DROTKERNEL   = rot_vfp.S
 CROTKERNEL   = rot_vfp.S
 ZROTKERNEL   = rot_vfp.S
 SSCALKERNEL  =  scal_vfp.S
 DSCALKERNEL  =  scal_vfp.S
 CSCALKERNEL  =  scal_vfp.S
 ZSCALKERNEL  =  scal_vfp.S
 SSWAPKERNEL  = swap_vfp.S
 DSWAPKERNEL  = swap_vfp.S
 CSWAPKERNEL  = swap_vfp.S
 ZSWAPKERNEL  = swap_vfp.S
 # Level-2 GEMV: real precisions share a source, complex ones have their own.
 SGEMVNKERNEL = gemv_n_vfp.S
 DGEMVNKERNEL = gemv_n_vfp.S
 CGEMVNKERNEL = cgemv_n_vfp.S
 ZGEMVNKERNEL = zgemv_n_vfp.S
 SGEMVTKERNEL = gemv_t_vfp.S
 DGEMVTKERNEL = gemv_t_vfp.S
 CGEMVTKERNEL = cgemv_t_vfp.S
 ZGEMVTKERNEL = zgemv_t_vfp.S
 # Level-3 TRMM kernels (4x2 for real, 2x2 for complex, per the file names).
 STRMMKERNEL	= strmm_kernel_4x2_vfp.S
 DTRMMKERNEL	= dtrmm_kernel_4x2_vfp.S
 CTRMMKERNEL	= ctrmm_kernel_2x2_vfp.S
 ZTRMMKERNEL	= ztrmm_kernel_2x2_vfp.S
 # SGEMM: kernel plus the four packing routines (inner/outer, n/t copy) and the
 # object names they are built into. Only the outer t-copy is generic C.
 SGEMMKERNEL    = sgemm_kernel_4x2_vfp.S		
 SGEMMINCOPY    = sgemm_ncopy_4_vfp.S
 SGEMMITCOPY    = sgemm_tcopy_4_vfp.S
 SGEMMINCOPYOBJ = sgemm_incopy.o
 SGEMMITCOPYOBJ = sgemm_itcopy.o
 SGEMMONCOPY    = sgemm_ncopy_2_vfp.S
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 # DGEMM: same layout as SGEMM.
 DGEMMKERNEL    = dgemm_kernel_4x2_vfp.S		
 DGEMMINCOPY    = dgemm_ncopy_4_vfp.S
 DGEMMITCOPY    = dgemm_tcopy_4_vfp.S
 DGEMMINCOPYOBJ = dgemm_incopy.o
 DGEMMITCOPYOBJ = dgemm_itcopy.o
 DGEMMONCOPY    = dgemm_ncopy_2_vfp.S
 DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 # CGEMM / ZGEMM: 2x2 kernels; only the outer copy routines are set here.
 CGEMMKERNEL    = cgemm_kernel_2x2_vfp.S
 CGEMMONCOPY    = cgemm_ncopy_2_vfp.S
 CGEMMOTCOPY    = cgemm_tcopy_2_vfp.S
 CGEMMONCOPYOBJ = cgemm_oncopy.o
 CGEMMOTCOPYOBJ = cgemm_otcopy.o
 ZGEMMKERNEL    = zgemm_kernel_2x2_vfp.S
 ZGEMMONCOPY    = zgemm_ncopy_2_vfp.S
 ZGEMMOTCOPY    = zgemm_tcopy_2_vfp.S
 ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o
 # TRSM: every precision and every side/transpose variant uses the generic C
 # implementation.
 STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
 DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
 DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@ -0,0 +1,141 @@
 # Kernel source selection for the ARMV7 core. Level-1/2 kernels use the
 # *_vfp.S sources; nrm2 and the GEMM/TRMM kernels use *_vfpv3.S sources.

 # amax / amin / max / min and their index-returning (I*) variants all map to
 # the same source file, iamax_vfp.S.
 # NOTE(review): presumably the variant is chosen by preprocessor defines
 # supplied by the build rules - confirm in the Level-1 kernel Makefile.
 SAMAXKERNEL  = iamax_vfp.S
 DAMAXKERNEL  = iamax_vfp.S
 CAMAXKERNEL  = iamax_vfp.S
 ZAMAXKERNEL  = iamax_vfp.S
 SAMINKERNEL  = iamax_vfp.S
 DAMINKERNEL  = iamax_vfp.S
 CAMINKERNEL  = iamax_vfp.S
 ZAMINKERNEL  = iamax_vfp.S
 SMAXKERNEL   = iamax_vfp.S
 DMAXKERNEL   = iamax_vfp.S
 SMINKERNEL   = iamax_vfp.S
 DMINKERNEL   = iamax_vfp.S
 ISAMAXKERNEL = iamax_vfp.S
 IDAMAXKERNEL = iamax_vfp.S
 ICAMAXKERNEL = iamax_vfp.S
 IZAMAXKERNEL = iamax_vfp.S
 ISAMINKERNEL = iamax_vfp.S
 IDAMINKERNEL = iamax_vfp.S
 ICAMINKERNEL = iamax_vfp.S
 IZAMINKERNEL = iamax_vfp.S
 ISMAXKERNEL  = iamax_vfp.S
 IDMAXKERNEL  = iamax_vfp.S
 ISMINKERNEL  = iamax_vfp.S
 IDMINKERNEL  = iamax_vfp.S
 # Level-1 kernels sharing one source across all four precisions.
 SSWAPKERNEL  = swap_vfp.S
 DSWAPKERNEL  = swap_vfp.S
 CSWAPKERNEL  = swap_vfp.S
 ZSWAPKERNEL  = swap_vfp.S
 SASUMKERNEL  = asum_vfp.S
 DASUMKERNEL  = asum_vfp.S
 CASUMKERNEL  = asum_vfp.S
 ZASUMKERNEL  = asum_vfp.S
 SAXPYKERNEL  = axpy_vfp.S
 DAXPYKERNEL  = axpy_vfp.S
 CAXPYKERNEL  = axpy_vfp.S
 ZAXPYKERNEL  = axpy_vfp.S
 # copy and dot have one dedicated source per precision.
 SCOPYKERNEL  = scopy_vfp.S
 DCOPYKERNEL  = dcopy_vfp.S
 CCOPYKERNEL  = ccopy_vfp.S
 ZCOPYKERNEL  = zcopy_vfp.S
 SDOTKERNEL   = sdot_vfp.S
 DDOTKERNEL   = ddot_vfp.S
 CDOTKERNEL   = cdot_vfp.S
 ZDOTKERNEL   = zdot_vfp.S
 # nrm2 is the only Level-1 kernel here that uses a vfpv3 source.
 SNRM2KERNEL  = nrm2_vfpv3.S
 DNRM2KERNEL  = nrm2_vfpv3.S
 CNRM2KERNEL  = nrm2_vfpv3.S
 ZNRM2KERNEL  = nrm2_vfpv3.S
 SROTKERNEL   = rot_vfp.S
 DROTKERNEL   = rot_vfp.S
 CROTKERNEL   = rot_vfp.S
 ZROTKERNEL   = rot_vfp.S
 SSCALKERNEL  = scal_vfp.S
 DSCALKERNEL  = scal_vfp.S
 CSCALKERNEL  = scal_vfp.S
 ZSCALKERNEL  = scal_vfp.S
 # Level-2 GEMV: real precisions share a source, complex ones have their own.
 SGEMVNKERNEL = gemv_n_vfp.S
 DGEMVNKERNEL = gemv_n_vfp.S
 CGEMVNKERNEL = cgemv_n_vfp.S
 ZGEMVNKERNEL = zgemv_n_vfp.S
 SGEMVTKERNEL = gemv_t_vfp.S
 DGEMVTKERNEL = gemv_t_vfp.S
 CGEMVTKERNEL = cgemv_t_vfp.S
 ZGEMVTKERNEL = zgemv_t_vfp.S
 # Level-3 TRMM kernels (4x4 for real, 2x2 for complex, per the file names).
 STRMMKERNEL  =  strmm_kernel_4x4_vfpv3.S
 DTRMMKERNEL  =  dtrmm_kernel_4x4_vfpv3.S		
 CTRMMKERNEL  =  ctrmm_kernel_2x2_vfpv3.S
 ZTRMMKERNEL  =  ztrmm_kernel_2x2_vfpv3.S
 # SGEMM: the generic 2x2 C kernel is kept as a commented-out alternative.
 # The inner copy routines and their objects are deliberately left empty;
 # only the outer n/t copy routines are assigned.
 #SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c		
 SGEMMKERNEL    =  sgemm_kernel_4x4_vfpv3.S		
 SGEMMINCOPY    =  
 SGEMMITCOPY    = 
 SGEMMONCOPY    =  sgemm_ncopy_4_vfp.S
 SGEMMOTCOPY    =  sgemm_tcopy_4_vfp.S
 SGEMMINCOPYOBJ = 
 SGEMMITCOPYOBJ = 
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 # DGEMM: same layout as SGEMM (inner copy routines left empty).
 DGEMMKERNEL    =  dgemm_kernel_4x4_vfpv3.S		
 DGEMMINCOPY    =  
 DGEMMITCOPY    =  
 DGEMMONCOPY    =  dgemm_ncopy_4_vfp.S
 DGEMMOTCOPY    =  dgemm_tcopy_4_vfp.S
 DGEMMINCOPYOBJ = 
 DGEMMITCOPYOBJ = 
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 # CGEMM / ZGEMM: 2x2 vfpv3 kernels with vfp outer copy routines.
 CGEMMKERNEL    = cgemm_kernel_2x2_vfpv3.S
 CGEMMONCOPY    = cgemm_ncopy_2_vfp.S
 CGEMMOTCOPY    = cgemm_tcopy_2_vfp.S
 CGEMMONCOPYOBJ = cgemm_oncopy.o
 CGEMMOTCOPYOBJ = cgemm_otcopy.o
 ZGEMMKERNEL    = zgemm_kernel_2x2_vfpv3.S
 ZGEMMONCOPY    = zgemm_ncopy_2_vfp.S
 ZGEMMOTCOPY    = zgemm_tcopy_2_vfp.S
 ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o
 # TRSM: every precision and every side/transpose variant uses the generic C
 # implementation.
 STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
 DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
 DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
--- a/kernel/arm/Makefile
+++ b/kernel/arm/Makefile
@ -0,0 +1,2 @@
 # Double-colon clean rule with no prerequisites and no recipe in this file.
 clean ::
--- a/kernel/arm/amax.c
+++ b/kernel/arm/amax.c
@ -0,0 +1,73 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * Largest absolute value of a strided vector.
  *
  * n      number of elements to scan
  * x      first element of the vector
  * inc_x  distance, in elements, between consecutive entries (must be >= 1)
  *
  * Returns max(|x[i*inc_x]|) for i = 0 .. n-1, or 0.0 when n <= 0 or
  * inc_x < 1. The empty case must be rejected before x[0] is touched,
  * otherwise a zero-length vector is read out of bounds.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	FLOAT maxf=0.0;
 	FLOAT absx;
 	if (n <= 0 || inc_x < 1 ) return(maxf);
 	/* seed with the first element, then scan the remaining n-1 entries */
 	maxf=ABS(x[0]);
 	ix = inc_x;
 	for (i = 1; i < n; i++)
 	{
 		absx = ABS(x[ix]);
 		/* maxf is already non-negative, no second ABS needed */
 		if( absx > maxf )
 		{
 			maxf = absx;
 		}
 		ix += inc_x;
 	}
 	return(maxf);
 }
--- a/kernel/arm/amin.c
+++ b/kernel/arm/amin.c
@ -0,0 +1,73 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * Smallest absolute value of a strided vector.
  *
  * n      number of elements to scan
  * x      first element of the vector
  * inc_x  distance, in elements, between consecutive entries (must be >= 1)
  *
  * Returns min(|x[i*inc_x]|) for i = 0 .. n-1, or 0.0 when n <= 0 or
  * inc_x < 1. The empty case must be rejected before x[0] is touched,
  * otherwise a zero-length vector is read out of bounds.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	FLOAT minf=0.0;
 	FLOAT absx;
 	if (n <= 0 || inc_x < 1 ) return(minf);
 	/* seed with the first element, then scan the remaining n-1 entries */
 	minf=ABS(x[0]);
 	ix = inc_x;
 	for (i = 1; i < n; i++)
 	{
 		absx = ABS(x[ix]);
 		/* minf is already non-negative, no second ABS needed */
 		if( absx < minf )
 		{
 			minf = absx;
 		}
 		ix += inc_x;
 	}
 	return(minf);
 }
--- a/kernel/arm/asum.c
+++ b/kernel/arm/asum.c
@ -0,0 +1,67 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * Sum of absolute values of a strided vector.
  *
  * Adds |x[k]| for k = 0, inc_x, 2*inc_x, ... while k < n*inc_x and returns
  * the total. Returns 0.0 when n is negative or inc_x is smaller than 1.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	FLOAT total = 0.0;
 	BLASLONG pos;
 	BLASLONG limit;
 	if (n < 0 || inc_x < 1 ) return(total);
 	/* one-past-the-end index of the strided walk */
 	limit = n * inc_x;
 	for (pos = 0; pos < limit; pos += inc_x)
 	{
 		total += ABS(x[pos]);
 	}
 	return(total);
 }
--- a/kernel/arm/asum_vfp.S
+++ b/kernel/arm/asum_vfp.S
@ -0,0 +1,481 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/11 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* STACKSIZE is defined but not referenced by the code in this file. */
 #define STACKSIZE 256
 /* Core-register aliases: element count, vector pointer, stride. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 /* Loop counter. */
 #define I	r12
 /* Byte offset ahead of X used by the pld prefetch hints. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 // Real double precision, unit stride: prefetch, load four doubles with
 // post-increment of X and add their absolute values alternately into the
 // two accumulators d0 and d1.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE  ]
 	fldmiad	X!, { d4 - d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	fldmiad	X!, { d6 - d7 }
 	vabs.f64   d6,  d6
 	vadd.f64   d1  , d1,  d5
 	vabs.f64   d7,  d7
 	vadd.f64   d0  , d0,  d6
 	vadd.f64   d1  , d1,  d7
 .endm
 // Unit stride, one double: d0 += |x|, X advanced by the load.
 .macro KERNEL_F1
 	fldmiad	X!, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 .endm
 // Strided, four doubles: each load leaves X untouched and X is then moved
 // by INC_X (a byte offset, scaled by the caller before this macro is used).
 // All four values go into d0.
 .macro KERNEL_S4
 	fldmiad	X, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	fldmiad	X, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	fldmiad	X, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	fldmiad	X, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 .endm
 // Strided, one double.
 .macro KERNEL_S1
 	fldmiad	X, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 .endm
 #else
 // Real single precision, unit stride: load four floats with post-increment
 // of X and add their absolute values alternately into s0 and s1.
 // This variant contains no pld; the main loop issues the prefetch itself.
 .macro KERNEL_F4
 	fldmias	X!, { s4 - s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	fldmias	X!, { s6 - s7 }
 	vabs.f32   s6,  s6
 	vadd.f32   s1  , s1,  s5
 	vabs.f32   s7,  s7
 	vadd.f32   s0  , s0,  s6
 	vadd.f32   s1  , s1,  s7
 .endm
 // Unit stride, one float: s0 += |x|, X advanced by the load.
 .macro KERNEL_F1
 	fldmias	X!, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 .endm
 // Strided, four floats: X is moved by INC_X (byte offset) after each load.
 // All four values go into s0.
 .macro KERNEL_S4
 	fldmias	X, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X
 	fldmias	X, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X
 	fldmias	X, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X
 	fldmias	X, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X
 .endm
 // Strided, one float.
 .macro KERNEL_S1
 	fldmias	X, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 // Complex double precision, unit stride: eight doubles (four re/im pairs)
 // per invocation, in two prefetched halves. Absolute values of the real and
 // imaginary parts are added alternately into d0 and d1.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE  ]
 	fldmiad	X!, { d4 - d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	fldmiad	X!, { d6 - d7 }
 	vabs.f64   d6,  d6
 	vadd.f64   d1  , d1,  d5
 	vabs.f64   d7,  d7
 	vadd.f64   d0  , d0,  d6
 	vadd.f64   d1  , d1,  d7
 	pld	[ X, #X_PRE  ]
 	fldmiad	X!, { d4 - d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	fldmiad	X!, { d6 - d7 }
 	vabs.f64   d6,  d6
 	vadd.f64   d1  , d1,  d5
 	vabs.f64   d7,  d7
 	vadd.f64   d0  , d0,  d6
 	vadd.f64   d1  , d1,  d7
 .endm
 // Unit stride, one complex element: two consecutive doubles added into d0.
 .macro KERNEL_F1
 	fldmiad	X!, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	fldmiad	X!, { d4 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 .endm
 // Strided, four complex elements: each step loads a pair of doubles at X,
 // adds both absolute values into d0 and moves X by INC_X (byte offset).
 .macro KERNEL_S4
 	fldmiad	X, { d4 -d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X
 	fldmiad	X, { d4 -d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X
 	fldmiad	X, { d4 -d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X
 	fldmiad	X, { d4 -d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X
 .endm
 // Strided, one complex element.
 .macro KERNEL_S1
 	fldmiad	X, { d4 -d5 }
 	vabs.f64   d4,  d4
 	vadd.f64   d0  , d0,  d4
 	vabs.f64   d5,  d5
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X
 .endm
 #else
 // Complex single precision, unit stride: one prefetch, then eight floats
 // (four re/im pairs). Absolute values are added alternately into s0 and s1.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE  ]
 	fldmias	X!, { s4 - s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	fldmias	X!, { s6 - s7 }
 	vabs.f32   s6,  s6
 	vadd.f32   s1  , s1,  s5
 	vabs.f32   s7,  s7
 	vadd.f32   s0  , s0,  s6
 	vadd.f32   s1  , s1,  s7
 	fldmias	X!, { s4 - s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	fldmias	X!, { s6 - s7 }
 	vabs.f32   s6,  s6
 	vadd.f32   s1  , s1,  s5
 	vabs.f32   s7,  s7
 	vadd.f32   s0  , s0,  s6
 	vadd.f32   s1  , s1,  s7
 .endm
 // Unit stride, one complex element: two consecutive floats added into s0.
 .macro KERNEL_F1
 	fldmias	X!, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	fldmias	X!, { s4 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 .endm
 // Strided, four complex elements: each step loads a pair of floats at X,
 // adds both absolute values into s0 and moves X by INC_X (byte offset).
 .macro KERNEL_S4
 	fldmias	X, { s4 -s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X
 	fldmias	X, { s4 -s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X
 	fldmias	X, { s4 -s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X
 	fldmias	X, { s4 -s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X
 .endm
 // Strided, one complex element.
 .macro KERNEL_S1
 	fldmias	X, { s4 -s5 }
 	vabs.f32   s4,  s4
 	vadd.f32   s0  , s0,  s4
 	vabs.f32   s5,  s5
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 #if defined(DOUBLE)
 	vsub.f64                d0 , d0 , d0
 	vsub.f64                d1 , d1 , d1
 #else
 	vsub.f32                s0 , s0 , s0
 	vsub.f32                s1 , s1 , s1
 #endif
 	cmp	N, #0
 	ble	asum_kernel_L999
 	cmp	INC_X, #0
 	beq	asum_kernel_L999
 	cmp	INC_X, #1
 	bne	asum_kernel_S_BEGIN
 asum_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	asum_kernel_F1
 	.align 5
 asum_kernel_F4:
 #if !defined(DOUBLE) && !defined(COMPLEX)
 	pld	[ X, #X_PRE  ]
 #endif
 	KERNEL_F4
 	subs	I, I, #1
 	ble	asum_kernel_F1
 	KERNEL_F4
 	subs	I, I, #1
 	bne	asum_kernel_F4
 asum_kernel_F1:
 	ands	I, N, #3
 	ble	asum_kernel_L999
 asum_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     asum_kernel_F10
 	b	asum_kernel_L999
 asum_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 #endif
 #endif
 	asrs	I, N, #2					// I = N / 4
 	ble	asum_kernel_S1
 	.align 5
 asum_kernel_S4:
 	KERNEL_S4
 	subs	I, I, #1
 	bne	asum_kernel_S4
 asum_kernel_S1:
 	ands	I, N, #3
 	ble	asum_kernel_L999
 asum_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     asum_kernel_S10
 asum_kernel_L999:
 #if defined(DOUBLE)
 	vadd.f64	d0 , d0, d1				// set return value
 #else
 	vadd.f32	s0 , s0, s1				// set return value
 #endif
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/axpy.c
@ -0,0 +1,64 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * y := y + da * x over n strided elements.
  *
  * x is read with stride inc_x and y is updated with stride inc_y, both
  * starting at index 0. Nothing is done when n is negative or da equals 0.0.
  * The dummy arguments are not used. Always returns 0.
  */
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG k;
 	BLASLONG xpos = 0;
 	BLASLONG ypos = 0;
 	if ( n < 0 || da == 0.0 ) return(0);
 	for (k = 0; k < n; k++)
 	{
 		y[ypos] += da * x[xpos] ;
 		xpos += inc_x ;
 		ypos += inc_y ;
 	}
 	return(0);
 }
--- a/kernel/arm/axpy_vfp.S
+++ b/kernel/arm/axpy_vfp.S
@ -0,0 +1,503 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/14 Saar
 * 	 BLASTEST 		: xOK
 * 	 CTEST			: xOK
 * 	 TEST			: xOK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes reserved below the pushed registers; the FP save area lives here. */
 #define STACKSIZE 256
 /* Stack-passed arguments, addressed from fp as set up in the prologue
    (fp = sp after the push + 8, i.e. the stack pointer value on entry). */
 #define	OLD_INC_X	[fp, #0 ]
 #define	OLD_Y		[fp, #4 ]
 #define	OLD_INC_Y	[fp, #8 ]
 /* Core-register aliases. Y, INC_X and INC_Y are loaded from the stack in
    the prologue, overwriting r1 and r2.
    NOTE(review): r1/r2 presumably hold the unused dummy0/dummy1 arguments of
    the axpy kernel signature on entry - confirm against the calling ABI. */
 #define	N	r0
 #define Y	r1
 #define	INC_X	r2
 #define	X	r3
 #define INC_Y	r4
 /* Loop counter. */
 #define I	r12
 /* Byte offset used by the pld prefetch hints for both X and Y. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*****************************************************************************************/
 // Multiply-accumulate mnemonics used by the complex macros.
 // fmac* adds the product to the destination, fnmac* subtracts it.
 //   FMAC_R1 / FMAC_R2 : real part      (alpha_r * x_r, alpha_i * x_i)
 //   FMAC_I1 / FMAC_I2 : imaginary part (alpha_r * x_i, alpha_i * x_r)
 // Without CONJ the alpha_i * x_i term is subtracted (plain complex product);
 // with CONJ the alpha_r * x_i term is subtracted instead, which corresponds
 // to multiplying by the conjugate of x.
 #if !defined(CONJ)
 #if defined(DOUBLE)
 #define	FMAC_R1	fmacd
 #define FMAC_R2 fnmacd
 #define	FMAC_I1	fmacd
 #define FMAC_I2 fmacd
 #else
 #define	FMAC_R1	fmacs
 #define FMAC_R2 fnmacs
 #define	FMAC_I1	fmacs
 #define FMAC_I2 fmacs
 #endif
 #else	// CONJ
 #if defined(DOUBLE)
 #define	FMAC_R1	fmacd
 #define FMAC_R2 fmacd
 #define	FMAC_I1	fnmacd
 #define FMAC_I2 fmacd
 #else
 #define	FMAC_R1	fmacs
 #define FMAC_R2 fmacs
 #define	FMAC_I1	fnmacs
 #define FMAC_I2 fmacs
 #endif
 #endif
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 // Real double precision, unit stride, four elements: y[i] += d0 * x[i].
 // X is advanced by its load; Y is loaded without writeback and advanced by
 // the four single-register stores.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	fldmiad		X!,  { d4 - d7  }
 	pld	[ Y, #X_PRE ]
 	fldmiad		Y ,  { d8 - d11 }
 	fmacd   	d8 , d0, d4
 	fstmiad		Y!, { d8 }
 	fmacd   	d9 , d0, d5
 	fstmiad		Y!, { d9 }
 	fmacd   	d10, d0, d6
 	fstmiad		Y!, { d10 }
 	fmacd   	d11, d0, d7
 	fstmiad		Y!, { d11 }
 .endm
 // Unit stride, one element.
 .macro KERNEL_F1
 	fldmiad		X!,  { d4 }
 	fldmiad		Y ,  { d8 }
 	fmacd   	d8 , d0, d4
 	fstmiad		Y!, { d8 }
 .endm
 // Strided, one element: X and Y are moved by INC_X / INC_Y, which are byte
 // offsets by the time this macro is used.
 .macro KERNEL_S1
 	fldmiad		X ,  { d4 }
 	fldmiad		Y ,  { d8 }
 	fmacd   	d8 , d0, d4
 	fstmiad		Y , { d8 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #else
 // Real single precision, unit stride, four elements: y[i] += s0 * x[i].
 // No pld in this variant; the main loop issues the prefetches itself.
 .macro KERNEL_F4
 	fldmias		X!,  { s4 - s7  }
 	fldmias		Y ,  { s8 - s11 }
 	fmacs   	s8 , s0, s4
 	fstmias		Y!, { s8 }
 	fmacs   	s9 , s0, s5
 	fstmias		Y!, { s9 }
 	fmacs   	s10, s0, s6
 	fstmias		Y!, { s10 }
 	fmacs   	s11, s0, s7
 	fstmias		Y!, { s11 }
 .endm
 // Unit stride, one element.
 .macro KERNEL_F1
 	fldmias		X!,  { s4 }
 	fldmias		Y ,  { s8 }
 	fmacs   	s8 , s0, s4
 	fstmias		Y!, { s8 }
 .endm
 // Strided, one element: pointers moved by the byte strides INC_X / INC_Y.
 .macro KERNEL_S1
 	fldmias		X ,  { s4 }
 	fldmias		Y ,  { s8 }
 	fmacs   	s8 , s0, s4
 	fstmias		Y , { s8 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 // Complex double precision, unit stride, four complex elements, handled as
 // two prefetched halves of two elements each. d0/d1 hold the real and
 // imaginary part of alpha; each element occupies a (re, im) register pair.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	fldmiad		X!,  { d4 - d7  }
 	pld	[ Y, #X_PRE ]
 	fldmiad		Y ,  { d8 - d11 }
 	FMAC_R1		d8 , d0, d4
 	FMAC_R2		d8 , d1, d5
 	FMAC_I1		d9 , d0, d5
 	FMAC_I2		d9 , d1, d4
 	fstmiad		Y!, { d8 }
 	fstmiad		Y!, { d9 }
 	FMAC_R1		d10, d0, d6
 	FMAC_R2		d10, d1, d7
 	FMAC_I1		d11, d0, d7
 	FMAC_I2		d11, d1, d6
 	fstmiad		Y!, { d10 }
 	fstmiad		Y!, { d11 }
 	pld	[ X, #X_PRE ]
 	fldmiad		X!,  { d4 - d7  }
 	pld	[ Y, #X_PRE ]
 	fldmiad		Y ,  { d8 - d11 }
 	FMAC_R1		d8 , d0, d4
 	FMAC_R2		d8 , d1, d5
 	FMAC_I1		d9 , d0, d5
 	FMAC_I2		d9 , d1, d4
 	fstmiad		Y!, { d8 }
 	fstmiad		Y!, { d9 }
 	FMAC_R1		d10, d0, d6
 	FMAC_R2		d10, d1, d7
 	FMAC_I1		d11, d0, d7
 	FMAC_I2		d11, d1, d6
 	fstmiad		Y!, { d10 }
 	fstmiad		Y!, { d11 }
 .endm
 // Unit stride, one complex element.
 .macro KERNEL_F1
 	fldmiad		X!,  { d4 - d5  }
 	fldmiad		Y ,  { d8 - d9 }
 	FMAC_R1		d8 , d0, d4
 	FMAC_R2		d8 , d1, d5
 	FMAC_I1		d9 , d0, d5
 	FMAC_I2		d9 , d1, d4
 	fstmiad		Y!, { d8 }
 	fstmiad		Y!, { d9 }
 .endm
 // Strided, one complex element: both parts are stored with one fstmiad and
 // the pointers are then moved by the byte strides INC_X / INC_Y.
 .macro KERNEL_S1
 	fldmiad		X ,  { d4 - d5 }
 	fldmiad		Y ,  { d8 - d9 }
 	FMAC_R1		d8 , d0, d4
 	FMAC_R2		d8 , d1, d5
 	FMAC_I1		d9 , d0, d5
 	FMAC_I2		d9 , d1, d4
 	fstmiad		Y  , { d8 - d9 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #else
 // Complex single precision, unit stride, four complex elements in two
 // halves; only the first half is preceded by prefetches. s0/s1 hold the
 // real and imaginary part of alpha.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	fldmias		X!,  { s4 - s7  }
 	pld	[ Y, #X_PRE ]
 	fldmias		Y ,  { s8 - s11 }
 	FMAC_R1		s8 , s0, s4
 	FMAC_R2		s8 , s1, s5
 	FMAC_I1		s9 , s0, s5
 	FMAC_I2		s9 , s1, s4
 	fstmias		Y!, { s8 }
 	fstmias		Y!, { s9 }
 	FMAC_R1		s10, s0, s6
 	FMAC_R2		s10, s1, s7
 	FMAC_I1		s11, s0, s7
 	FMAC_I2		s11, s1, s6
 	fstmias		Y!, { s10 }
 	fstmias		Y!, { s11 }
 	fldmias		X!,  { s4 - s7  }
 	fldmias		Y ,  { s8 - s11 }
 	FMAC_R1		s8 , s0, s4
 	FMAC_R2		s8 , s1, s5
 	FMAC_I1		s9 , s0, s5
 	FMAC_I2		s9 , s1, s4
 	fstmias		Y!, { s8 }
 	fstmias		Y!, { s9 }
 	FMAC_R1		s10, s0, s6
 	FMAC_R2		s10, s1, s7
 	FMAC_I1		s11, s0, s7
 	FMAC_I2		s11, s1, s6
 	fstmias		Y!, { s10 }
 	fstmias		Y!, { s11 }
 .endm
 // Unit stride, one complex element.
 .macro KERNEL_F1
 	fldmias		X!,  { s4 - s5  }
 	fldmias		Y ,  { s8 - s9 }
 	FMAC_R1		s8 , s0, s4
 	FMAC_R2		s8 , s1, s5
 	FMAC_I1		s9 , s0, s5
 	FMAC_I2		s9 , s1, s4
 	fstmias		Y!, { s8 }
 	fstmias		Y!, { s9 }
 .endm
 // Strided, one complex element: pointers moved by the byte strides.
 .macro KERNEL_S1
 	fldmias		X ,  { s4 - s5 }
 	fldmias		Y ,  { s8 - s9 }
 	FMAC_R1		s8 , s0, s4
 	FMAC_R2		s8 , s1, s5
 	FMAC_I1		s9 , s0, s5
 	FMAC_I2		s9 , s1, s4
 	fstmias		Y  , { s8 - s9 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * axpy entry point: y := y + alpha * x over N elements.
  * alpha is taken from s0/d0 (plus s1/d1 for the imaginary part in complex
  * builds). X arrives in r3; INC_X, Y and INC_Y are fetched from the stack.
  * Nothing is done when N <= 0 or either stride is zero. Returns 0 in r0.
  */
 	PROLOGUE
 	.align 5
 	// save r4/fp and make fp point at the first stack-passed argument
 	push    {r4 , fp}
        add     fp, sp, #8
 	sub     sp, sp, #STACKSIZE                              // reserve stack
 	ldr    INC_X , OLD_INC_X
 	ldr         Y, OLD_Y
 	ldr    INC_Y , OLD_INC_Y
 	// FP save area starts 128 bytes below fp, inside the reserved stack
 	sub     r12, fp, #128
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15}                                 // store floating point registers
 #else
        vstm    r12, { s8 - s15}                                 // store floating point registers
 #endif
 	cmp	N, #0
 	ble	axpy_kernel_L999
 	cmp	INC_X, #0
 	beq	axpy_kernel_L999
 	cmp	INC_Y, #0
 	beq	axpy_kernel_L999
 	// the contiguous path needs both strides to be exactly 1
 	cmp	INC_X, #1
 	bne	axpy_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	axpy_kernel_S_BEGIN
 // ---- unit stride -------------------------------------------------------
 axpy_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	axpy_kernel_F1
 	.align 5
 	// Loop body is unrolled twice: two KERNEL_F4 per iteration, with an
 	// early exit between them when the counter runs out.
 axpy_kernel_F4:
 #if !defined(COMPLEX) && !defined(DOUBLE)
 	// the real single KERNEL_F4 has no prefetch of its own
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 #endif
 	KERNEL_F4
 	subs	I, I, #1
 	ble	axpy_kernel_F1
 	KERNEL_F4
 	subs	I, I, #1
 	bne	axpy_kernel_F4
 axpy_kernel_F1:
 	// remainder: N % 4 elements, one at a time
 	ands	I, N, #3
 	ble	axpy_kernel_L999
 axpy_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     axpy_kernel_F10
 	b	axpy_kernel_L999
 // ---- general stride ----------------------------------------------------
 axpy_kernel_S_BEGIN:
 	// convert the element strides into byte strides
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 #endif
 #endif
 	asrs	I, N, #2					// I = N / 4
 	ble	axpy_kernel_S1
 	.align 5
 	// four strided elements per iteration
 axpy_kernel_S4:
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	subs	I, I, #1
 	bne	axpy_kernel_S4
 axpy_kernel_S1:
 	// remainder: N % 4 elements, one at a time
 	ands	I, N, #3
 	ble	axpy_kernel_L999
 axpy_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     axpy_kernel_S10
 // ---- restore registers and return --------------------------------------
 axpy_kernel_L999:
 	// r3 (X) is no longer needed and is reused as the save-area pointer
 	sub     r3, fp, #128
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s15 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	sub     sp, fp, #8
 	pop     {r4,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/ccopy_vfp.S
+++ b/kernel/arm/ccopy_vfp.S
@ -0,0 +1,222 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/07 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 // Register and stack-slot names for the ccopy kernel in this file.
 // PROLOGUE/EPILOGUE used further down are not defined here (presumably they
 // come from common.h).
 #define ASSEMBLER
 #include "common.h"
 // Bytes of stack reserved below the pushed core registers.
 #define STACKSIZE 256
 // Values arriving in core registers r0-r3.
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 // Word just above the pushed registers (fp is set to sp + 24 after a
 // seven-register push), read as the Y increment.
 #define OLD_INC_Y	[fp, #4 ]
 // Working registers: loop counter, destination pointer, destination stride.
 #define I	r5
 #define Y	r6
 #define INC_Y	r7
 // Prefetch distance in bytes used by the pld in COPY_F4.
 #define X_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 // Contiguous copy step: prefetch X_PRE bytes ahead of X, load eight
 // single-precision words (s0-s7) from X and store them to Y. Both pointers
 // are post-incremented by the transfer (the "!" writeback).
 .macro COPY_F4
 	pld	[ X, #X_PRE  ]
 	fldmias	X!, { s0 - s7 }
 	fstmias	Y!, { s0 - s7 }
 .endm
 // Contiguous copy step for the remainder loop: moves two single-precision
 // words (s0-s1) from X to Y, post-incrementing both pointers.
 .macro COPY_F1
 	fldmias	X!, { s0 - s1 }
 	fstmias	Y!, { s0 - s1 }
 .endm
 /*************************************************************************************************************************/
 // Strided copy, four steps unrolled. Each step loads two words from X and
 // stores them to Y without writeback, then advances X by INC_X and Y by
 // INC_Y (the caller has already scaled both to byte strides). The steps
 // alternate between s0-s1 and s2-s3 as the transfer registers.
 // NOTE(review): the purpose of the leading nop is not evident from this file.
 .macro COPY_S4
 	nop
 	fldmias	X, { s0 - s1 }
 	fstmias	Y, { s0 - s1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s2 - s3 }
 	fstmias	Y, { s2 - s3 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s0 - s1 }
 	fstmias	Y, { s0 - s1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s2 - s3 }
 	fstmias	Y, { s2 - s3 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 // Strided copy of a single two-word element: load from X, store to Y, then
 // step X by INC_X and Y by INC_Y (byte strides).
 .macro COPY_S1
 	fldmias	X, { s0 - s1 }
 	fstmias	Y, { s0 - s1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	// ccopy entry point: copies N two-word elements from X to Y.
 	// Returns immediately (r0 = 0) when N <= 0 or either increment is 0.
 	// Uses the contiguous path when both increments equal 1, otherwise the
 	// strided path with the increments scaled to bytes.
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24					// fp -> saved fp slot, so [fp, #4] is the word above the push area
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15} 				// store floating point registers
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 	// Argument screening; every early exit still goes through the common epilogue.
 	cmp	N, #0
 	ble	ccopy_kernel_L999
 	cmp	INC_X, #0
 	beq	ccopy_kernel_L999
 	cmp	INC_Y, #0
 	beq	ccopy_kernel_L999
 	// Any non-unit increment selects the strided path.
 	cmp	INC_X, #1
 	bne	ccopy_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	ccopy_kernel_S_BEGIN
 	// ---- contiguous path: blocks of four elements, then the N & 3 remainder ----
 ccopy_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	ccopy_kernel_F1
 ccopy_kernel_F4:
 	COPY_F4
 	subs	I, I, #1
 	bne	ccopy_kernel_F4
 ccopy_kernel_F1:
 	ands	I, N, #3
 	ble	ccopy_kernel_L999
 ccopy_kernel_F10:
 	COPY_F1
 	subs    I, I, #1
        bne     ccopy_kernel_F10
 	b	ccopy_kernel_L999
 	// ---- strided path: same blocking, pointers advanced by byte strides ----
 ccopy_kernel_S_BEGIN:
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
 	asrs	I, N, #2					// I = N / 4
 	ble	ccopy_kernel_S1
 ccopy_kernel_S4:
 	COPY_S4
 	subs	I, I, #1
 	bne	ccopy_kernel_S4
 ccopy_kernel_S1:
 	ands	I, N, #3
 	ble	ccopy_kernel_L999
 ccopy_kernel_S10:
 	COPY_S1
 	subs    I, I, #1
        bne     ccopy_kernel_S10
 	// ---- common exit: restore s8-s15, unwind the frame, return 0 in r0 ----
 ccopy_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/cdot_vfp.S
+++ b/kernel/arm/cdot_vfp.S
@ -0,0 +1,284 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/11 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 // Register and stack-slot names for the cdot kernel in this file.
 // PROLOGUE/EPILOGUE used further down are not defined here (presumably they
 // come from common.h).
 #define ASSEMBLER
 #include "common.h"
 // Bytes of stack reserved below the pushed core registers.
 #define STACKSIZE 256
 // Values arriving in core registers r0-r3.
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 // Word just above the pushed registers (fp is set to sp + 24 after a
 // seven-register push), read as the Y increment.
 #define OLD_INC_Y	[fp, #4 ]
 // Working registers: loop counter, second-vector pointer, its stride.
 #define I	r5
 #define Y	r6
 #define INC_Y	r7
 // Prefetch distance in bytes, applied to both X and Y in KERNEL_F4.
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 // Contiguous dot-product step over four two-word elements.
 // With each X element in (s4,s5) or (s6,s7) and each Y element in (s8,s9) or
 // (s10,s11), the four running sums are updated as
 //   s0 += x[0]*y[0]   s1 += x[0]*y[1]   s2 += x[1]*y[1]   s3 += x[1]*y[0]
 // Loads are interleaved with the multiply-accumulates; X and Y are
 // post-incremented by every load. Both streams are prefetched X_PRE bytes ahead.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE  ]
 	pld	[ Y, #X_PRE  ]
 	fldmias	X!, { s4 - s5 }
 	fldmias	Y!, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fldmias	X!, { s6 - s7 }
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 	fldmias	Y!, { s10 - s11 }
 	fmacs   s0  , s6,  s10
 	fmacs   s1  , s6,  s11
 	fmacs   s2  , s7,  s11
 	fmacs   s3  , s7,  s10
 	fldmias	X!, { s4 - s5 }
 	fldmias	Y!, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fldmias	X!, { s6 - s7 }
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 	fldmias	Y!, { s10 - s11 }
 	fmacs   s0  , s6,  s10
 	fmacs   s1  , s6,  s11
 	fmacs   s2  , s7,  s11
 	fmacs   s3  , s7,  s10
 .endm
 // Contiguous dot-product step for one element: loads (s4,s5) from X and
 // (s8,s9) from Y with post-increment and adds the four cross products into
 // s0 (s4*s8), s1 (s4*s9), s2 (s5*s9) and s3 (s5*s8).
 .macro KERNEL_F1
 	fldmias	X!, { s4 - s5 }
 	fldmias	Y!, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 .endm
 /*************************************************************************************************************************/
 // Strided dot-product, four elements unrolled. Every step loads (s4,s5) from
 // X and (s8,s9) from Y without writeback, accumulates the four cross products
 // into s0-s3 exactly as KERNEL_S1 does, then advances X by INC_X and Y by
 // INC_Y (byte strides prepared by the caller).
 // NOTE(review): the purpose of the leading nop is not evident from this file.
 .macro KERNEL_S4
 	nop
 	fldmias	X, { s4 - s5 }
 	fldmias	Y, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s4 - s5 }
 	fldmias	Y, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s4 - s5 }
 	fldmias	Y, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s4 - s5 }
 	fldmias	Y, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 // Strided dot-product step for one element: loads (s4,s5) from X and (s8,s9)
 // from Y, adds s4*s8, s4*s9, s5*s9 and s5*s8 into s0, s1, s2 and s3, then
 // steps X by INC_X and Y by INC_Y.
 .macro KERNEL_S1
 	fldmias	X, { s4 - s5 }
 	fldmias	Y, { s8 - s9 }
 	fmacs   s0  , s4,  s8
 	fmacs   s1  , s4,  s9
 	fmacs   s2  , s5,  s9
 	fmacs   s3  , s5,  s8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	// cdot entry point: accumulates the four partial sums s0-s3 over N
 	// two-word elements of X and Y and combines them into the result pair
 	// (s0, s1) at exit:
 	//   without CONJ: s0 = s0 - s2, s1 = s1 + s3
 	//   with CONJ:    s0 = s0 + s2, s1 = s1 - s3
 	// When N <= 0 or either increment is 0 the (zero) accumulators are
 	// combined and returned unchanged.
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24					// fp -> saved fp slot, so [fp, #4] is the word above the push area
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15} 				// store floating point registers
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 	// Clear the accumulators with a real zero. "vsub sN, sN, sN" is not a
 	// safe zeroing idiom: s0-s3 hold arbitrary values on entry and x - x is
 	// NaN whenever x is NaN or +/-Inf, which would poison the result.
 	// r4 is free again here (it only addressed the save area above) and
 	// mov without the S suffix leaves the condition flags alone.
 	mov	r4, #0						// all-zero bits == +0.0f
 	vmov	s0 , r4
 	vmov.f32	s1 , s0
 	vmov.f32	s2 , s0
 	vmov.f32	s3 , s0
 	// Argument screening; every early exit still goes through the common epilogue.
 	cmp	N, #0
 	ble	cdot_kernel_L999
 	cmp	INC_X, #0
 	beq	cdot_kernel_L999
 	cmp	INC_Y, #0
 	beq	cdot_kernel_L999
 	// Any non-unit increment selects the strided path.
 	cmp	INC_X, #1
 	bne	cdot_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	cdot_kernel_S_BEGIN
 	// ---- contiguous path: blocks of four elements, then the N & 3 remainder ----
 cdot_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	cdot_kernel_F1
 cdot_kernel_F4:
 	KERNEL_F4
 	subs	I, I, #1
 	bne	cdot_kernel_F4
 cdot_kernel_F1:
 	ands	I, N, #3
 	ble	cdot_kernel_L999
 cdot_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     cdot_kernel_F10
 	b	cdot_kernel_L999
 	// ---- strided path: same blocking, pointers advanced by byte strides ----
 cdot_kernel_S_BEGIN:
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
 	asrs	I, N, #2					// I = N / 4
 	ble	cdot_kernel_S1
 cdot_kernel_S4:
 	KERNEL_S4
 	subs	I, I, #1
 	bne	cdot_kernel_S4
 cdot_kernel_S1:
 	ands	I, N, #3
 	ble	cdot_kernel_L999
 cdot_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     cdot_kernel_S10
 	// ---- common exit: restore s8-s15, fold the partial sums, unwind ----
 cdot_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 #if !defined(CONJ)
 	vsub.f32	s0 , s0, s2
 	vadd.f32	s1 , s1, s3
 #else
 	vadd.f32	s0 , s0, s2
 	vsub.f32	s1 , s1, s3
 #endif
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/cgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfp.S
--- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
--- a/kernel/arm/cgemm_ncopy_2_vfp.S
+++ b/kernel/arm/cgemm_ncopy_2_vfp.S
@ -0,0 +1,258 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/05 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 // Register and stack-slot names for the cgemm ncopy routine in this file.
 // PROLOGUE/EPILOGUE used further down are not defined here (presumably they
 // come from common.h).
 #define ASSEMBLER
 #include "common.h"
 // Bytes of stack reserved below the pushed core registers.
 #define STACKSIZE 256
 // Values arriving in core registers r0-r3.
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 // Frame slot holding the byte-scaled leading dimension (r3 is reused as I).
 #define LDA	[fp, #-260 ]
 // Word just above the pushed registers: the destination buffer pointer.
 #define B	[fp, #4 ]
 // M, N and A stay in the registers they arrived in.
 #define M	r0
 #define N	r1
 #define A	r2
 // Working registers: output pointer and the two source column pointers.
 #define	BO	r5
 #define	AO1	r6
 #define	AO2	r7
 // Inner and outer loop counters.
 #define I	r3
 #define	J	r12
 // Prefetch distance in bytes for the pld instructions on AO1/AO2.
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 // Packs 16 bytes from each of the two source columns into 32 contiguous
 // output bytes. Register order written to BO is s0-s7, i.e.
 //   AO1[0..7], AO2[0..7], AO1[8..15], AO2[8..15]  (byte offsets)
 // so the two columns are interleaved in 8-byte units. AO1 and AO2 advance
 // by 16 bytes; BO is post-incremented by the store.
 .macro COPY2x2
 	flds	s0 , [ AO1, #0  ]
 	flds	s1 , [ AO1, #4  ]
 	flds	s4 , [ AO1, #8 ]
 	flds	s5 , [ AO1, #12 ]
 	flds	s2 , [ AO2, #0  ]
 	flds	s3 , [ AO2, #4  ]
 	add	AO1, AO1, #16
 	flds	s6 , [ AO2, #8 ]
 	flds	s7 , [ AO2, #12 ]
 	fstmias	BO!, { s0 - s7 }
 	add	AO2, AO2, #16
 .endm
 // Packs one 8-byte unit from each of the two source columns: writes
 // AO1[0..7] followed by AO2[0..7] to BO (post-incremented) and advances
 // both source pointers by 8 bytes.
 .macro COPY1x2
 	flds	s0 , [ AO1, #0  ]
 	flds	s1 , [ AO1, #4  ]
 	flds	s2 , [ AO2, #0  ]
 	flds	s3 , [ AO2, #4  ]
 	add	AO1, AO1, #8
 	fstmias	BO!, { s0 - s3 }
 	add	AO2, AO2, #8
 .endm
 // Single-column case: copies 16 consecutive bytes from AO1 to BO
 // (post-incremented) and advances AO1 by 16 bytes.
 .macro COPY2x1
 	flds	s0 , [ AO1, #0  ]
 	flds	s1 , [ AO1, #4  ]
 	flds	s2 , [ AO1, #8 ]
 	flds	s3 , [ AO1, #12 ]
 	fstmias	BO!, { s0 - s3 }
 	add	AO1, AO1, #16
 .endm
 // Single-column remainder: copies 8 bytes from AO1 to BO (post-incremented)
 // and advances AO1 by 8 bytes.
 .macro COPY1x1
 	flds	s0 , [ AO1, #0  ]
 	flds	s1 , [ AO1, #4  ]
 	fstmias	BO!, { s0 - s1 }
 	add	AO1, AO1, #8
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	// cgemm ncopy entry point: walks the source two columns at a time
 	// (columns are LDA bytes apart after scaling) and writes the packed
 	// result sequentially through BO; an odd trailing column is copied on
 	// its own. Returns 0 in r0.
 	//
 	// Flag handling: asrs/ands/tst leave the V flag untouched, and nothing
 	// in this routine sets V before the first test, so a following "ble"
 	// (Z set or N != V) would depend on whatever the caller left in the
 	// flags. Counts are therefore compared against zero with cmp, and
 	// single-bit tests branch with beq.
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24					// fp -> saved fp slot, so [fp, #4] is the word above the push area
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	lsl	r3, r3, #3					// lda = lda * 4 * 2
 	str	r3, LDA						// r3 is reused as I below
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15} 				// store floating point registers
 	ldr	BO, B
 /*********************************************************************************************/
 	// ---- column pairs ----
 cgemm_ncopy_L2_BEGIN:
 	asr	J, N, #1					// J = N / 2
 	cmp	J, #0						// sets N, Z and V from J itself
 	ble	cgemm_ncopy_L1_BEGIN
 cgemm_ncopy_L2_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	AO2, AO1, r4
 	add	A  , AO2, r4 					// A = A + 2 * LDA
 	asr	I, M, #1					// I = M / 2
 	cmp	I, #0
 	ble	cgemm_ncopy_L2_M2_40
 	// Row-pair loop, unrolled by two with a mid-loop exit for odd counts.
 cgemm_ncopy_L2_M2_20:
 	pld	[ AO1, #A_PRE ]
 	pld	[ AO2, #A_PRE ]
 	COPY2x2
 	subs	I , I , #1
 	ble	cgemm_ncopy_L2_M2_40
 	COPY2x2
 	subs	I , I , #1
 	bne	cgemm_ncopy_L2_M2_20
 cgemm_ncopy_L2_M2_40:
 	ands	I, M , #1					// odd M: one trailing row
 	beq	cgemm_ncopy_L2_M2_END
 cgemm_ncopy_L2_M2_60:
 	COPY1x2
 	subs	I , I , #1
 	bne	cgemm_ncopy_L2_M2_60
 cgemm_ncopy_L2_M2_END:
 	subs    J , J, #1                                               // j--
        bne     cgemm_ncopy_L2_M2_BEGIN
 /*********************************************************************************************/
 	// ---- odd trailing column ----
 cgemm_ncopy_L1_BEGIN:
 	tst	N, #1
 	beq	cgemm_ncopy_L999
 cgemm_ncopy_L1_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	A  , AO1, r4 					// A = A + 1 * LDA
 	asr	I, M, #1					// I = M / 2
 	cmp	I, #0
 	ble	cgemm_ncopy_L1_M2_40
 cgemm_ncopy_L1_M2_20:
 	COPY2x1
 	subs	I , I , #1
 	bne	cgemm_ncopy_L1_M2_20
 cgemm_ncopy_L1_M2_40:
 	ands	I, M , #1					// odd M: one trailing row
 	beq	cgemm_ncopy_L1_M2_END
 cgemm_ncopy_L1_M2_60:
 	COPY1x1
 	subs	I , I , #1
 	bne	cgemm_ncopy_L1_M2_60
 cgemm_ncopy_L1_M2_END:
 	// ---- common exit: restore s8-s15, unwind the frame, return 0 in r0 ----
 cgemm_ncopy_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/cgemm_tcopy_2_vfp.S
+++ b/kernel/arm/cgemm_tcopy_2_vfp.S
@ -0,0 +1,243 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/07 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 // Register and stack-slot names for the cgemm tcopy routine in this file.
 // PROLOGUE/EPILOGUE used further down are not defined here (presumably they
 // come from common.h).
 #define ASSEMBLER
 #include "common.h"
 // Bytes of stack reserved below the pushed core registers.
 #define STACKSIZE 256
 // Values arriving in core registers r0-r3.
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 // Word just above the pushed registers: the destination pointer. The routine
 // reads it and also writes the advanced value back to this slot.
 #define B	[fp, #4 ]
 // Frame slot holding the running source pointer (r2 is reused as M4).
 #define A	[fp, #-248 ]
 // M and N stay in the registers they arrived in.
 #define M	r0
 #define N	r1
 // Output stride in bytes between successive stores through BO1 (M << 4).
 #define M4	r2
 // Byte-scaled leading dimension.
 #define	LDA	r5
 // Source pointer and the two output pointers (main area / odd-N tail area).
 #define	AO1	r6
 #define	BO1	r7
 #define	BO2	r8
 // Inner and outer loop counters.
 #define I	r4
 #define	J	r12
 // NOTE(review): A_PRE is not referenced by the code visible in this file.
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 // Copies a 2x2 tile: 16 bytes from AO1 and 16 bytes from AO1 + LDA are
 // written as 32 contiguous bytes at BO1. AO1 advances by 16 bytes and BO1
 // by M4 bytes. r3 is used as a scratch address register.
 .macro COPY2x2
 	fldmias	AO1, { s0 - s3 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s4 - s7 }
 	fstmias	BO1, { s0 - s7 }
 	add	AO1, AO1, #16
 	add	BO1, BO1, M4
 .endm
 // Tail tile for odd N with two source rows: 8 bytes from AO1 and 8 bytes
 // from AO1 + LDA are written as 16 contiguous bytes at BO2. AO1 advances by
 // 8 bytes and BO2 by 16. r3 is used as a scratch address register.
 .macro COPY1x2
 	fldmias	AO1, { s0 -s1 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s2 - s3 }
 	fstmias	BO2, { s0 - s3 }
 	add	AO1, AO1, #8
 	add	BO2, BO2, #16
 .endm
 /*************************************************************************************************************************/
 // Single-row tile: copies 16 bytes from AO1 to BO1, then advances AO1 by
 // 16 bytes and BO1 by M4 bytes.
 .macro COPY2x1
 	fldmias	AO1, { s0 - s3 }
 	fstmias	BO1, { s0 - s3 }
 	add	AO1, AO1, #16
 	add	BO1, BO1, M4
 .endm
 // Single-row tail for odd N: copies 8 bytes from AO1 to BO2 and advances
 // both pointers by 8 bytes.
 .macro COPY1x1
 	fldmias	AO1, { s0 - s1 }
 	fstmias	BO2, { s0 - s1 }
 	add	AO1, AO1, #8
 	add	BO2, BO2, #8
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	// cgemm tcopy entry point: walks the source two rows at a time (rows are
 	// LDA bytes apart after scaling). Within a row pair, 16-byte pieces go
 	// to BO1, which strides by M4 bytes between pieces, and the odd trailing
 	// piece (N odd) goes to the BO2 tail area that starts
 	// (N & ~1) * M * 8 bytes into the destination. Returns 0 in r0.
 	//
 	// Flag handling: asrs/tst leave the V flag untouched, and nothing in
 	// this routine sets V before the first test, so a following "ble"
 	// (Z set or N != V) would depend on whatever the caller left in the
 	// flags. Counts are therefore compared against zero with cmp, and
 	// single-bit tests branch with beq.
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24					// fp -> saved fp slot, so [fp, #4] is the word above the push area
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	str	OLD_A, A					// store A
 	lsl	LDA, OLD_LDA, #3				// lda = lda * SIZE * 2
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15} 				// store floating point registers
 	lsl	r4 , M, #3					// M * SIZE * 2
 	ldr	r3, B
 	and	BO2 , N , #-2					// N rounded down to even
 	mul	BO2, BO2, r4
 	add	BO2 , BO2, r3					// BO2 = B + (N & ~1) * M * SIZE * 2
 	lsl	M4, M, #4					// M4 = M * 2 * SIZE * 2
 	// ---- row pairs ----
 cgemm_tcopy_L2_BEGIN:
 	asr	J, M, #1					// J = M / 2
 	cmp	J, #0						// sets N, Z and V from J itself
 	ble	cgemm_tcopy_L1_BEGIN
 cgemm_tcopy_L2_M2_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	lsl	r3, LDA, #1					// r3 = 2 * LDA
 	add	r3, r3 , AO1					// A = A + 2 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #32					// B = B + 4 * SIZE *2
 	str	r3, B
 	asr	I, N, #1					// I = N / 2
 	cmp	I, #0
 	ble	cgemm_tcopy_L2_M2_60
 cgemm_tcopy_L2_M2_40:
 	COPY2x2
 	subs I, I, #1
 	bne	cgemm_tcopy_L2_M2_40
 cgemm_tcopy_L2_M2_60:
 	tst	N , #1						// odd N: one trailing piece per row pair
 	beq	cgemm_tcopy_L2_M2_END
 	COPY1x2
 cgemm_tcopy_L2_M2_END:
 	subs	J , J, #1						// j--
 	bne	cgemm_tcopy_L2_M2_BEGIN
 /*********************************************************************************************/
 	// ---- odd trailing row ----
 cgemm_tcopy_L1_BEGIN:
 	tst	M, #1
 	beq	cgemm_tcopy_L999
 cgemm_tcopy_L1_M2_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	add	r3, LDA , AO1					// A = A + 1 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #16					// B = B + 2 * SIZE *2
 	str	r3, B
 	asr	I, N, #1					// I = N / 2
 	cmp	I, #0
 	ble	cgemm_tcopy_L1_M2_60
 cgemm_tcopy_L1_M2_40:
 	COPY2x1
 	subs I, I, #1
 	bne	cgemm_tcopy_L1_M2_40
 cgemm_tcopy_L1_M2_60:
 	tst	N , #1						// odd N: one trailing piece
 	beq	cgemm_tcopy_L1_M2_END
 	COPY1x1
 cgemm_tcopy_L1_M2_END:
 	// ---- common exit: restore s8-s15, unwind the frame, return 0 in r0 ----
 cgemm_tcopy_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/cgemv_n_vfp.S
+++ b/kernel/arm/cgemv_n_vfp.S
@ -0,0 +1,697 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/29 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 // Register and stack-slot names for the cgemv (non-transposed) kernel in
 // this file. PROLOGUE/EPILOGUE used further down are not defined here
 // (presumably they come from common.h).
 #define ASSEMBLER
 #include "common.h"
 // Bytes of stack reserved below the pushed core registers.
 #define STACKSIZE 256
 // Words at and above fp. The prologue pushes seven registers and sets
 // fp = sp + 28, so [fp, #0] is the first word above the push area.
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 // Values arriving in core registers; both are spilled to the frame slots
 // M and A below before their registers are reused.
 #define OLD_A		r3
 #define	OLD_M		r0
 // Working registers.
 #define AO1	r0
 #define N	r1
 #define J	r2
 #define AO2	r4
 #define XO	r5
 #define YO	r6
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 #define I	r12
 // Frame slots for alpha (stored from s0/s1 by the prologue), M and the
 // running A pointer.
 #define ALPHA_I [fp, #-236]
 #define ALPHA_R [fp, #-244]
 #define M	[fp, #-252 ]
 #define A	[fp, #-256 ]
 // Prefetch distances in bytes for the pld instructions on XO, YO and AO2.
 #define X_PRE	64
 #define Y_PRE	0
 #define A_PRE	0
 /**************************************************************************************/
 // Opcode selection for the four CONJ/XCONJ build variants.
 //   KMAC_R / KMAC_I  - used in the KERNEL_* macros for the products that
 //                      involve the second word of the matrix element
 //                      (s1 or s3) with s5 and s4 respectively.
 //   FMAC_R1/R2/I1/I2 - used in the SAVE_* macros when alpha times the
 //                      accumulators is folded into the values loaded from YO.
 // fmacs adds the product to the destination, fnmacs subtracts it, so each
 // variant is a choice of signs. CONJ changes only KMAC_R/KMAC_I; XCONJ
 // additionally changes FMAC_R2 and FMAC_I1.
 #if !defined(CONJ) && !defined(XCONJ)
        #define KMAC_R  fnmacs
        #define KMAC_I  fmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fnmacs
        #define FMAC_I1 fmacs
        #define FMAC_I2 fmacs
 #elif defined(CONJ) && !defined(XCONJ)
        #define KMAC_R  fmacs
        #define KMAC_I  fnmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fnmacs
        #define FMAC_I1 fmacs
        #define FMAC_I2 fmacs
 #elif !defined(CONJ) && defined(XCONJ)
        #define KMAC_R  fmacs
        #define KMAC_I  fnmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fmacs
        #define FMAC_I1 fnmacs
        #define FMAC_I2 fmacs
 #else
        #define KMAC_R  fnmacs
        #define KMAC_I  fmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fmacs
        #define FMAC_I1 fnmacs
        #define FMAC_I2 fmacs
 #endif
 // Prefetches the output at YO + Y_PRE and clears the eight accumulators
 // s8-s15 used by KERNEL_F4X1 / SAVE_F4.
 // The zero is loaded through a core register: "vsub s8, s8, s8" is not a
 // safe zeroing idiom because x - x is NaN whenever x is NaN or +/-Inf, and
 // s8 holds either entry garbage or the previous row block's sum here.
 // Clobbers r3 (scratch at every place this macro is invoked); the condition
 // flags are left untouched.
 .macro INIT_F4
 	pld	[ YO, #Y_PRE ]
        mov                     r3 , #0
        vmov                    s8 , r3
        vmov.f32                s9 , s8
        vmov.f32                s10, s8
        vmov.f32                s11, s8
        vmov.f32                s12, s8
        vmov.f32                s13, s8
        vmov.f32                s14, s8
        vmov.f32                s15, s8
 .endm
 // Four consecutive KERNEL_F4X1 steps (four columns), preceded by one
 // prefetch X_PRE bytes ahead of XO.
 .macro KERNEL_F4X4
 	pld	[ XO, #X_PRE ]
 	KERNEL_F4X1
 	KERNEL_F4X1
 	KERNEL_F4X1
 	KERNEL_F4X1
 .endm
 // One column step for a block of four rows, contiguous X.
 // Loads one X element into (s4,s5) and four matrix elements from AO1 at
 // byte offsets 0, 8, 16 and 24 (two at a time through s0-s3). For each
 // matrix element (a0,a1) with accumulator pair (acc0,acc1):
 //   acc0 += a0*s4 ; acc1 += a0*s5 ; KMAC_R acc0, a1, s5 ; KMAC_I acc1, a1, s4
 // Accumulator pairs are (s8,s9), (s10,s11), (s12,s13), (s14,s15).
 // Afterwards XO advances by 8 bytes and AO1/AO2 by LDA; AO2 is only used as
 // the prefetch address.
 .macro KERNEL_F4X1
 	pld	[ AO2, #A_PRE ]
        flds    s0 , [ AO1 ]
        flds    s1 , [ AO1, #4  ]
        flds    s2 , [ AO1, #8 ]
        flds    s3 , [ AO1, #12 ]
        flds    s4 , [ XO ]
        flds    s5 , [ XO, #4 ]
        fmacs   s8  , s0,  s4
        fmacs   s9  , s0,  s5
        fmacs   s10 , s2,  s4
        fmacs   s11 , s2,  s5
        KMAC_R  s8  , s1,  s5
        KMAC_I  s9  , s1,  s4
        KMAC_R  s10 , s3,  s5
        KMAC_I  s11 , s3,  s4
        flds    s0 , [ AO1, #16 ]
        flds    s1 , [ AO1, #20 ]
        flds    s2 , [ AO1, #24 ]
        flds    s3 , [ AO1, #28 ]
        fmacs   s12 , s0,  s4
        fmacs   s13 , s0,  s5
        fmacs   s14 , s2,  s4
        fmacs   s15 , s2,  s5
        KMAC_R  s12 , s1,  s5
        KMAC_I  s13 , s1,  s4
        KMAC_R  s14 , s3,  s5
        KMAC_I  s15 , s3,  s4
        add     XO , XO, #8
        add     AO1 , AO1, LDA
        add     AO2 , AO2, LDA
 .endm
 // Folds alpha * accumulators into four contiguous output elements.
 // Alpha is reloaded from the frame into (s0,s1). The output is processed in
 // two halves of two elements each: the four words are loaded from YO into
 // s4-s7, updated with the FMAC_* opcodes from accumulators s8-s11 (first
 // half) or s12-s15 (second half), and stored back with post-increment, so
 // YO ends 32 bytes further on.
 .macro SAVE_F4
        flds            s0, ALPHA_R
        flds            s1, ALPHA_I
        fldmias YO, { s4 - s7 }
        FMAC_R1 s4 , s0 , s8
        FMAC_I1 s5 , s0 , s9
        FMAC_R2 s4 , s1 , s9
        FMAC_I2 s5 , s1 , s8
        FMAC_R1 s6 , s0 , s10
        FMAC_I1 s7 , s0 , s11
        FMAC_R2 s6 , s1 , s11
        FMAC_I2 s7 , s1 , s10
        fstmias YO!, { s4 - s7 }
        fldmias YO, { s4 - s7 }
        FMAC_R1 s4 , s0 , s12
        FMAC_I1 s5 , s0 , s13
        FMAC_R2 s4 , s1 , s13
        FMAC_I2 s5 , s1 , s12
        FMAC_R1 s6 , s0 , s14
        FMAC_I1 s7 , s0 , s15
        FMAC_R2 s6 , s1 , s15
        FMAC_I2 s7 , s1 , s14
        fstmias YO!, { s4 - s7 }
 .endm
 // Clears the accumulator pair (s8,s9) used by KERNEL_F1X1 / SAVE_F1.
 // The zero is loaded through a core register: "vsub s8, s8, s8" is not a
 // safe zeroing idiom because x - x is NaN whenever x is NaN or +/-Inf.
 // Clobbers r3 (scratch at every place this macro is invoked); the condition
 // flags are left untouched.
 .macro INIT_F1
        mov                     r3 , #0
        vmov                    s8 , r3
        vmov.f32                s9 , s8
 .endm
 // One column step for a single row, contiguous X: loads the matrix element
 // (s0,s1) from AO1 and the X element (s4,s5) from XO, then
 //   s8 += s0*s4 ; s9 += s0*s5 ; KMAC_R s8, s1, s5 ; KMAC_I s9, s1, s4
 // XO advances by 8 bytes and AO1 by LDA.
 .macro KERNEL_F1X1
        flds    s0 , [ AO1 ]
        flds    s1 , [ AO1, #4 ]
        flds    s4 , [ XO ]
        flds    s5 , [ XO, #4 ]
        fmacs   s8  , s0,  s4
        fmacs   s9  , s0,  s5
        KMAC_R  s8  , s1,  s5
        KMAC_I  s9  , s1,  s4
        add     XO , XO, #8
        add     AO1 , AO1, LDA
 .endm
 // Folds alpha * (s8,s9) into one contiguous output element: reloads alpha
 // into (s0,s1), loads the element from YO into (s4,s5), applies the four
 // FMAC_* updates, stores it back and advances YO by 8 bytes.
 .macro SAVE_F1
        flds            s0, ALPHA_R
        flds            s1, ALPHA_I
        fldmias YO, { s4 - s5 }
        FMAC_R1 s4 , s0 , s8
        FMAC_I1 s5 , s0 , s9
        FMAC_R2 s4 , s1 , s9
        FMAC_I2 s5 , s1 , s8
        fstmias YO, { s4 - s5 }
        add     YO, YO, #8
 .endm
 /****************************************************************************************/
 // Clears the eight accumulators s8-s15 used by KERNEL_S4X1 / SAVE_S4.
 // The zero is loaded through a core register: "vsub s8, s8, s8" is not a
 // safe zeroing idiom because x - x is NaN whenever x is NaN or +/-Inf, and
 // s8 holds either entry garbage or the previous row block's sum here.
 // Clobbers r3 (scratch at every place this macro is invoked); the condition
 // flags are left untouched.
 .macro INIT_S4
        mov                     r3 , #0
        vmov                    s8 , r3
        vmov.f32                s9 , s8
        vmov.f32                s10, s8
        vmov.f32                s11, s8
        vmov.f32                s12, s8
        vmov.f32                s13, s8
        vmov.f32                s14, s8
        vmov.f32                s15, s8
 .endm
 // Four consecutive KERNEL_S4X1 steps (four columns) for the strided path.
 .macro KERNEL_S4X4
 	KERNEL_S4X1
 	KERNEL_S4X1
 	KERNEL_S4X1
 	KERNEL_S4X1
 .endm
 // One column step for a block of four rows, strided X.
 // Same arithmetic as the contiguous four-row step: one X element in (s4,s5),
 // four matrix elements from AO1 at byte offsets 0, 8, 16 and 24, and for
 // each element (a0,a1) with accumulator pair (acc0,acc1):
 //   acc0 += a0*s4 ; acc1 += a0*s5 ; KMAC_R acc0, a1, s5 ; KMAC_I acc1, a1, s4
 // The differences are that XO advances by INC_X (a byte stride) and that no
 // prefetch is issued. AO1 and AO2 both advance by LDA.
 .macro KERNEL_S4X1
        flds    s0 , [ AO1 ]
        flds    s1 , [ AO1, #4  ]
        flds    s2 , [ AO1, #8 ]
        flds    s3 , [ AO1, #12 ]
        flds    s4 , [ XO ]
        flds    s5 , [ XO, #4 ]
        fmacs   s8  , s0,  s4
        fmacs   s9  , s0,  s5
        fmacs   s10 , s2,  s4
        fmacs   s11 , s2,  s5
        KMAC_R  s8  , s1,  s5
        KMAC_I  s9  , s1,  s4
        KMAC_R  s10 , s3,  s5
        KMAC_I  s11 , s3,  s4
        flds    s0 , [ AO1, #16 ]
        flds    s1 , [ AO1, #20 ]
        flds    s2 , [ AO1, #24 ]
        flds    s3 , [ AO1, #28 ]
        fmacs   s12 , s0,  s4
        fmacs   s13 , s0,  s5
        fmacs   s14 , s2,  s4
        fmacs   s15 , s2,  s5
        KMAC_R  s12 , s1,  s5
        KMAC_I  s13 , s1,  s4
        KMAC_R  s14 , s3,  s5
        KMAC_I  s15 , s3,  s4
        add     XO , XO, INC_X
        add     AO1 , AO1, LDA
        add     AO2 , AO2, LDA
 .endm
 // Folds alpha * accumulators into four strided output elements.
 // Alpha is reloaded into (s0,s1). Each of the four steps loads one element
 // from YO (alternating between (s4,s5) and (s6,s7)), applies the FMAC_*
 // updates from the matching accumulator pair - (s8,s9), (s10,s11),
 // (s12,s13), (s14,s15) in that order - stores it back and advances YO by
 // INC_Y (a byte stride).
 .macro SAVE_S4
        flds            s0, ALPHA_R
        flds            s1, ALPHA_I
        fldmias YO, { s4 - s5 }
        FMAC_R1 s4 , s0 , s8
        FMAC_I1 s5 , s0 , s9
        FMAC_R2 s4 , s1 , s9
        FMAC_I2 s5 , s1 , s8
        fstmias YO, { s4 - s5 }
 	add	YO, YO, INC_Y
        fldmias YO, { s6 - s7 }
        FMAC_R1 s6 , s0 , s10
        FMAC_I1 s7 , s0 , s11
        FMAC_R2 s6 , s1 , s11
        FMAC_I2 s7 , s1 , s10
        fstmias YO, { s6 - s7 }
 	add	YO, YO, INC_Y
        fldmias YO, { s4 - s5 }
        FMAC_R1 s4 , s0 , s12
        FMAC_I1 s5 , s0 , s13
        FMAC_R2 s4 , s1 , s13
        FMAC_I2 s5 , s1 , s12
        fstmias YO, { s4 - s5 }
 	add	YO, YO, INC_Y
        fldmias YO, { s6 - s7 }
        FMAC_R1 s6 , s0 , s14
        FMAC_I1 s7 , s0 , s15
        FMAC_R2 s6 , s1 , s15
        FMAC_I2 s7 , s1 , s14
        fstmias YO, { s6 - s7 }
 	add	YO, YO, INC_Y
 .endm
 // Clears the accumulator pair (s8,s9) used by KERNEL_S1X1 / SAVE_S1.
 // The zero is loaded through a core register: "vsub s8, s8, s8" is not a
 // safe zeroing idiom because x - x is NaN whenever x is NaN or +/-Inf.
 // Clobbers r3 (scratch at every place this macro is invoked); the condition
 // flags are left untouched.
 .macro INIT_S1
        mov                     r3 , #0
        vmov                    s8 , r3
        vmov.f32                s9 , s8
 .endm
 // One column step for a single row, strided X: loads the matrix element
 // (s0,s1) from AO1 and the X element (s4,s5) from XO, then
 //   s8 += s0*s4 ; s9 += s0*s5 ; KMAC_R s8, s1, s5 ; KMAC_I s9, s1, s4
 // XO advances by INC_X (a byte stride) and AO1 by LDA.
 .macro KERNEL_S1X1
        flds    s0 , [ AO1 ]
        flds    s1 , [ AO1, #4 ]
        flds    s4 , [ XO ]
        flds    s5 , [ XO, #4 ]
        fmacs   s8  , s0,  s4
        fmacs   s9  , s0,  s5
        KMAC_R  s8  , s1,  s5
        KMAC_I  s9  , s1,  s4
        add     XO , XO, INC_X
        add     AO1 , AO1, LDA
 .endm
 // Folds alpha * (s8,s9) into one strided output element: reloads alpha into
 // (s0,s1), loads the element from YO into (s4,s5), applies the four FMAC_*
 // updates, stores it back and advances YO by INC_Y (a byte stride).
 .macro SAVE_S1
        flds            s0, ALPHA_R
        flds            s1, ALPHA_I
        fldmias YO, { s4 - s5 }
        FMAC_R1 s4 , s0 , s8
        FMAC_I1 s5 , s0 , s9
        FMAC_R2 s4 , s1 , s9
        FMAC_I2 s5 , s1 , s8
        fstmias YO, { s4 - s5 }
        add     YO, YO, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	// cgemv (non-transposed) entry point.
 	// For every row of the matrix it accumulates the products of that row's
 	// elements with X over N columns (columns are LDA bytes apart after
 	// scaling) and then adds alpha times the sum into the matching element
 	// of Y. Rows are handled four at a time with a one-row remainder loop.
 	// Returns 0 in r0; exits early when M <= 0, N <= 0 or either increment
 	// is 0. The contiguous path is used when both increments equal 1.
 	PROLOGUE
 	.align 5
 	push    {r4 - r9 , fp}
        add     fp, sp, #28
 	sub     sp, sp, #STACKSIZE                              // reserve stack
        sub     r12, fp, #192
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
        vstm    r12, { s8 - s15 }                                 // store floating point registers
 #endif
 	cmp	OLD_M, #0
 	ble	cgemvn_kernel_L999
 	cmp	N, #0
 	ble	cgemvn_kernel_L999
 	// Spill A, M and alpha to the frame; r0 and r3 are reused as AO1 and scratch.
 	str	OLD_A, A
 	str	OLD_M, M
 	vstr    s0 , ALPHA_R
        vstr    s1 , ALPHA_I
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	cmp	INC_X, #0
 	beq	cgemvn_kernel_L999
 	cmp	INC_Y, #0
 	beq	cgemvn_kernel_L999
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #4				// LDA * SIZE * 2
 #else
 	lsl	LDA, LDA, #3				// LDA * SIZE * 2
 #endif
 	// Any non-unit increment selects the strided path.
 	cmp	INC_X, #1
 	bne	cgemvn_kernel_S4_BEGIN
 	cmp	INC_Y, #1
 	bne	cgemvn_kernel_S4_BEGIN
 	// ---- contiguous path, four rows per outer iteration ----
 cgemvn_kernel_F4_BEGIN:
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #2					// I = M / 4
 	ble	cgemvn_kernel_F1_BEGIN
 cgemvn_kernel_F4X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #32					// next row block starts 32 bytes further down the column
 	str	r3 , A
 	add	AO2, AO2, LDA
 	add	AO2, AO2, LDA					// AO2 = AO1 + 3 * LDA, used only as a prefetch address
 	ldr	XO , X
 	INIT_F4
 	asrs	J, N, #2					// J = N / 4
 	ble	cgemvn_kernel_F4X1
 cgemvn_kernel_F4X4_10:
 	KERNEL_F4X4
 	subs	J, J, #1
 	bne	cgemvn_kernel_F4X4_10
 cgemvn_kernel_F4X1:
 	ands	J, N , #3					// remaining columns
 	ble	cgemvn_kernel_F4_END
 cgemvn_kernel_F4X1_10:
 	KERNEL_F4X1
 	subs	J, J, #1
 	bne	cgemvn_kernel_F4X1_10
 cgemvn_kernel_F4_END:
 	SAVE_F4
 	subs	I , I , #1
 	bne	cgemvn_kernel_F4X4
 	// ---- contiguous path, remaining M & 3 rows one at a time ----
 cgemvn_kernel_F1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #3
 	ble	cgemvn_kernel_L999
 cgemvn_kernel_F1X1:
 	ldr	AO1, A
 	add	r3, AO1, #8					// next row starts 8 bytes further down the column
 	str	r3, A
 	ldr	XO , X
 	INIT_F1
 	mov	J, N
 cgemvn_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	J, J, #1
 	bne	cgemvn_kernel_F1X1_10
 cgemvn_kernel_F1_END:
 	SAVE_F1
 	subs	I , I , #1
 	bne	cgemvn_kernel_F1X1
 	b	cgemvn_kernel_L999
 /*************************************************************************************************************/
 	// ---- strided path: same structure, increments scaled to byte strides ----
 cgemvn_kernel_S4_BEGIN:
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
 #endif
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #2					// I = M / 4
 	ble	cgemvn_kernel_S1_BEGIN
 cgemvn_kernel_S4X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #32					// next row block starts 32 bytes further down the column
 	str	r3 , A
 	ldr	XO , X
 	INIT_S4
 	asrs	J, N, #2					// J = N / 4
 	ble	cgemvn_kernel_S4X1
 cgemvn_kernel_S4X4_10:
 	KERNEL_S4X4
 	subs	J, J, #1
 	bne	cgemvn_kernel_S4X4_10
 cgemvn_kernel_S4X1:
 	ands	J, N , #3					// remaining columns
 	ble	cgemvn_kernel_S4_END
 cgemvn_kernel_S4X1_10:
 	KERNEL_S4X1
 	subs	J, J, #1
 	bne	cgemvn_kernel_S4X1_10
 cgemvn_kernel_S4_END:
 	SAVE_S4
 	subs	I , I , #1
 	bne	cgemvn_kernel_S4X4
 	// ---- strided path, remaining M & 3 rows one at a time ----
 cgemvn_kernel_S1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #3
 	ble	cgemvn_kernel_L999
 cgemvn_kernel_S1X1:
 	ldr	AO1, A
 	add	r3, AO1, #8					// next row starts 8 bytes further down the column
 	str	r3, A
 	ldr	XO , X
 	INIT_S1
 	mov	J, N
 cgemvn_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	J, J, #1
 	bne	cgemvn_kernel_S1X1_10
 cgemvn_kernel_S1_END:
 	SAVE_S1
 	subs	I , I , #1
 	bne	cgemvn_kernel_S1X1
 /*************************************************************************************************************/
 	// ---- common exit: restore the saved VFP registers, unwind, return 0 ----
 cgemvn_kernel_L999:
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s15 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/cgemv_t_vfp.S
+++ b/kernel/arm/cgemv_t_vfp.S
@ -0,0 +1,607 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/29 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Size in bytes of the scratch frame the prologue reserves below the saved registers. */
 #define STACKSIZE 256
 /* Stack-passed arguments. The prologue sets fp = sp + 28 after pushing seven
  * registers, so non-negative offsets from fp address the words above them. */
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 /* Register-passed arguments as they arrive on entry. */
 #define OLD_A		r3
 #define	OLD_N		r1
 #define M	r0
 /* Working registers. AO1 shares r1 with OLD_N, which the prologue spills to N first. */
 #define AO1	r1
 #define J	r2
 #define AO2	r4
 #define XO	r5
 #define YO	r6
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 #define I	r12
 /* Spill slots inside the reserved frame (negative offsets from fp). */
 #define N	[fp, #-252 ]
 #define A	[fp, #-256 ]
 /* Prefetch distances; no macro or instruction in this file references them. */
 #define X_PRE	512
 #define A_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * Pick the sign of each multiply-accumulate term from the CONJ / XCONJ build flags.
  * fmacs adds the product to its destination, fnmacs subtracts it.
  * KMAC_R / KMAC_I are applied inside the KERNEL_* macros to the first / second
  * accumulator of each pair; FMAC_R1, FMAC_R2, FMAC_I1 and FMAC_I2 are applied
  * inside the SAVE_* macros when the accumulators are folded into y.
  */
 #if !defined(CONJ) && !defined(XCONJ)
        #define KMAC_R  fnmacs
        #define KMAC_I  fmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fnmacs
        #define FMAC_I1 fmacs
        #define FMAC_I2 fmacs
 #elif defined(CONJ) && !defined(XCONJ)
        #define KMAC_R  fmacs
        #define KMAC_I  fnmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fnmacs
        #define FMAC_I1 fmacs
        #define FMAC_I2 fmacs
 #elif !defined(CONJ) && defined(XCONJ)
        #define KMAC_R  fmacs
        #define KMAC_I  fnmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fmacs
        #define FMAC_I1 fnmacs
        #define FMAC_I2 fmacs
 #else
        #define KMAC_R  fnmacs
        #define KMAC_I  fmacs
        #define FMAC_R1 fmacs
        #define FMAC_R2 fmacs
        #define FMAC_I1 fnmacs
        #define FMAC_I2 fmacs
 #endif
 /*
  * INIT_F2: set the four accumulators s12-s15 to +0.0.
  * The zero is taken from an integer register rather than computed as x - x:
  * a self-subtraction yields NaN whenever the register still holds Inf or NaN
  * (stale caller data on the first pass, or the previous column pair's sums),
  * which would poison the new column's result.
  * Clobbers r3, which is only a short-lived temporary at the expansion site.
  */
 .macro INIT_F2
 	mov	r3 , #0
 	vmov	s12, r3
 	vmov.f32	s13, s12
 	vmov.f32	s14, s12
 	vmov.f32	s15, s12
 .endm
 /* KERNEL_F2X4: four consecutive KERNEL_F2X1 steps, emitted by assembler repetition. */
 .macro KERNEL_F2X4
 	.rept	4
 	KERNEL_F2X1
 	.endr
 .endm
 /*
  * KERNEL_F2X1: one accumulation step for two columns at once (contiguous x).
  * Loads two floats each from XO, AO1 and AO2, post-incrementing all three
  * pointers, then accumulates into s12/s13 (AO1 column) and s14/s15 (AO2 column).
  * The terms built from the second float of each A pair take their sign from
  * KMAC_R / KMAC_I. Presumably each pair is one complex value (re, im) - confirm.
  */
 .macro KERNEL_F2X1
 	fldmias	XO! ,  { s2 - s3 }
 	fldmias	AO1!,  { s4 - s5 }
 	fldmias	AO2!,  { s8 - s9   }
 	fmacs	s12 , s4 , s2
 	fmacs	s13 , s4 , s3
 	KMAC_R  s12 , s5 , s3
        KMAC_I  s13 , s5 , s2
 	fmacs	s14 , s8 , s2
 	fmacs	s15 , s8 , s3
        KMAC_R  s14 , s9 , s3
        KMAC_I  s15 , s9 , s2
 .endm
 /*
  * SAVE_F2: read four consecutive floats at YO, add the accumulators s12-s15
  * scaled by s0 and s1 (term signs from FMAC_R1/R2/I1/I2), write them back and
  * advance YO past them.
  * s0 / s1 are never written in this file; presumably they carry alpha (re, im)
  * from the caller - confirm.
  */
 .macro	SAVE_F2
 	fldmias	YO,  { s4 - s7 }
 	FMAC_R1 s4 , s0 , s12
        FMAC_I1 s5 , s0 , s13
        FMAC_R2 s4 , s1 , s13
        FMAC_I2 s5 , s1 , s12
        FMAC_R1 s6 , s0 , s14
        FMAC_I1 s7 , s0 , s15
        FMAC_R2 s6 , s1 , s15
        FMAC_I2 s7 , s1 , s14
 	fstmias	YO!, { s4 - s7 }
 .endm
 /************************************************************************************************/
 /*
  * INIT_F1: set the accumulator pair s12/s13 to +0.0.
  * The zero is taken from an integer register rather than computed as x - x,
  * because a self-subtraction yields NaN when the register still holds Inf or
  * NaN left over from earlier work, which would poison the new sums.
  * Clobbers r3, which holds nothing live at the expansion site.
  */
 .macro INIT_F1
 	mov	r3 , #0
 	vmov	s12, r3
 	vmov.f32	s13, s12
 .endm
 /* KERNEL_F1X4: four consecutive KERNEL_F1X1 steps, emitted by assembler repetition. */
 .macro KERNEL_F1X4
 	.rept	4
 	KERNEL_F1X1
 	.endr
 .endm
 /*
  * KERNEL_F1X1: one accumulation step for a single column (contiguous x).
  * Loads two floats from XO and two from AO1, post-incrementing both pointers,
  * and accumulates into s12/s13; the terms using the second A float take their
  * sign from KMAC_R / KMAC_I.
  */
 .macro KERNEL_F1X1
 	fldmias	XO! ,  { s2 - s3 }
 	fldmias	AO1!,  { s4 - s5 }
 	fmacs	s12 , s4 , s2
 	fmacs	s13 , s4 , s3
 	KMAC_R  s12 , s5 , s3
        KMAC_I  s13 , s5 , s2
 .endm
 /*
  * SAVE_F1: read two floats at YO, add the accumulators s12/s13 scaled by s0 and
  * s1 (term signs from FMAC_R1/R2/I1/I2), write them back and advance YO past them.
  */
 .macro	SAVE_F1
 	fldmias	YO,  { s4 - s5 }
 	FMAC_R1 s4 , s0 , s12
        FMAC_I1 s5 , s0 , s13
        FMAC_R2 s4 , s1 , s13
        FMAC_I2 s5 , s1 , s12
 	fstmias	YO!, { s4 - s5 }
 .endm
 /************************************************************************************************/
 /*
  * INIT_S2: set the four accumulators s12-s15 to +0.0 (strided path).
  * The zero is taken from an integer register rather than computed as x - x:
  * a self-subtraction yields NaN whenever the register still holds Inf or NaN
  * (stale caller data on the first pass, or the previous column pair's sums),
  * which would poison the new column's result.
  * Clobbers r3, which is only a short-lived temporary at the expansion site.
  */
 .macro INIT_S2
 	mov	r3 , #0
 	vmov	s12, r3
 	vmov.f32	s13, s12
 	vmov.f32	s14, s12
 	vmov.f32	s15, s12
 .endm
 /* KERNEL_S2X4: four consecutive KERNEL_S2X1 steps, emitted by assembler repetition. */
 .macro KERNEL_S2X4
 	.rept	4
 	KERNEL_S2X1
 	.endr
 .endm
 /*
  * KERNEL_S2X1: one accumulation step for two columns at once (strided x).
  * Same arithmetic as KERNEL_F2X1, but XO is read without writeback and then
  * advanced by INC_X, which the caller has already scaled to a byte stride.
  */
 .macro KERNEL_S2X1
 	fldmias	XO  ,  { s2 - s3 }
 	fldmias	AO1!,  { s4 - s5 }
 	fldmias	AO2!,  { s8 - s9   }
 	fmacs	s12 , s4 , s2
 	fmacs	s13 , s4 , s3
 	KMAC_R  s12 , s5 , s3
        KMAC_I  s13 , s5 , s2
 	fmacs	s14 , s8 , s2
 	fmacs	s15 , s8 , s3
        KMAC_R  s14 , s9 , s3
        KMAC_I  s15 , s9 , s2
 	add	XO, XO, INC_X
 .endm
 /*
  * SAVE_S2: update two y entries that are INC_Y bytes apart.
  * The first entry receives s12/s13 and the second s14/s15, each scaled by s0
  * and s1 with term signs from FMAC_R1/R2/I1/I2. YO ends two strides further on.
  */
 .macro	SAVE_S2
 	fldmias	YO,  { s4 - s5 }
 	FMAC_R1 s4 , s0 , s12
        FMAC_I1 s5 , s0 , s13
        FMAC_R2 s4 , s1 , s13
        FMAC_I2 s5 , s1 , s12
 	fstmias	YO,  { s4 - s5 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s6 - s7 }
        FMAC_R1 s6 , s0 , s14
        FMAC_I1 s7 , s0 , s15
        FMAC_R2 s6 , s1 , s15
        FMAC_I2 s7 , s1 , s14
 	fstmias	YO,  { s6 - s7 }
 	add	YO, YO, INC_Y
 .endm
 /************************************************************************************************/
 /*
  * INIT_S1: set the accumulator pair s12/s13 to +0.0 (strided path).
  * The zero is taken from an integer register rather than computed as x - x,
  * because a self-subtraction yields NaN when the register still holds Inf or
  * NaN left over from earlier work, which would poison the new sums.
  * Clobbers r3, which holds nothing live at the expansion site.
  */
 .macro INIT_S1
 	mov	r3 , #0
 	vmov	s12, r3
 	vmov.f32	s13, s12
 .endm
 /* KERNEL_S1X4: four consecutive KERNEL_S1X1 steps, emitted by assembler repetition. */
 .macro KERNEL_S1X4
 	.rept	4
 	KERNEL_S1X1
 	.endr
 .endm
 /*
  * KERNEL_S1X1: one accumulation step for a single column (strided x).
  * Same arithmetic as KERNEL_F1X1, but XO is read without writeback and then
  * advanced by the byte stride INC_X.
  */
 .macro KERNEL_S1X1
 	fldmias	XO  ,  { s2 - s3 }
 	fldmias	AO1!,  { s4 - s5 }
 	fmacs	s12 , s4 , s2
 	fmacs	s13 , s4 , s3
 	KMAC_R  s12 , s5 , s3
        KMAC_I  s13 , s5 , s2
 	add	XO, XO, INC_X
 .endm
 /*
  * SAVE_S1: read two floats at YO, add the accumulators s12/s13 scaled by s0 and
  * s1 (term signs from FMAC_R1/R2/I1/I2), write them back in place and advance
  * YO by the byte stride INC_Y.
  */
 .macro	SAVE_S1
 	fldmias	YO,  { s4 - s5 }
 	FMAC_R1 s4 , s0 , s12
        FMAC_I1 s5 , s0 , s13
        FMAC_R2 s4 , s1 , s13
        FMAC_I2 s5 , s1 , s12
 	fstmias	YO,  { s4 - s5 }
 	add	YO, YO, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Entry point. For every column of A (N columns, LDA elements apart) the code
  * accumulates the products of that column with x over M entries and folds the
  * sum, scaled by s0/s1, into the matching entry of y. Columns are processed in
  * pairs with one leftover column when N is odd. A contiguous path is taken when
  * both increments are 1, otherwise a strided path. Always returns 0 in r0.
  */
 	PROLOGUE
 	.align 5
 	push    {r4 - r9 , fp}
        add     fp, sp, #28
 	sub     sp, sp, #STACKSIZE                              // reserve stack
        sub     r12, fp, #192
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
        vstm    r12, { s8 - s15 }                                 // store floating point registers
 #endif
 	// nothing to do when either dimension is not positive
 	cmp	M, #0
 	ble	cgemvt_kernel_L999
 	cmp	OLD_N, #0
 	ble	cgemvt_kernel_L999
 	// r3 and r1 get reused below, so keep A and N in the frame
 	str	OLD_A, A
 	str	OLD_N, N
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	// a zero increment on either vector exits without touching y
 	cmp	INC_X, #0
 	beq	cgemvt_kernel_L999
 	cmp	INC_Y, #0
 	beq	cgemvt_kernel_L999
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #4				// LDA * SIZE
 #else
 	lsl	LDA, LDA, #3				// LDA * SIZE
 #endif
 	// the contiguous path needs both increments equal to 1
 	cmp	INC_X, #1
 	bne	cgemvt_kernel_S2_BEGIN
 	cmp	INC_Y, #1
 	bne	cgemvt_kernel_S2_BEGIN
 cgemvt_kernel_F2_BEGIN:
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	cgemvt_kernel_F1_BEGIN
 cgemvt_kernel_F2X4:
 	// AO1 / AO2 = current column pair; the saved A moves on by two columns
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	ldr	XO , X
 	INIT_F2
 	asrs	I, M, #2					// I = M / 4
 	ble	cgemvt_kernel_F2X1
 cgemvt_kernel_F2X4_10:
 	KERNEL_F2X4
 	subs	I, I, #1
 	bne	cgemvt_kernel_F2X4_10
 cgemvt_kernel_F2X1:
 	// remaining M % 4 entries, one at a time
 	ands	I, M , #3
 	ble	cgemvt_kernel_F2_END
 cgemvt_kernel_F2X1_10:
 	KERNEL_F2X1
 	subs	I, I, #1
 	bne	cgemvt_kernel_F2X1_10
 cgemvt_kernel_F2_END:
 	SAVE_F2
 	subs	J , J , #1
 	bne	cgemvt_kernel_F2X4
 cgemvt_kernel_F1_BEGIN:
 	// one leftover column when N is odd
 	ldr	J, N
 	ands	J, J, #1
 	ble	cgemvt_kernel_L999
 cgemvt_kernel_F1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_F1
 	asrs	I, M, #2					// I = M / 4
 	ble	cgemvt_kernel_F1X1
 cgemvt_kernel_F1X4_10:
 	KERNEL_F1X4
 	subs	I, I, #1
 	bne	cgemvt_kernel_F1X4_10
 cgemvt_kernel_F1X1:
 	ands	I, M , #3
 	ble	cgemvt_kernel_F1_END
 cgemvt_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	I, I, #1
 	bne	cgemvt_kernel_F1X1_10
 cgemvt_kernel_F1_END:
 	SAVE_F1
 	b	cgemvt_kernel_L999
 /*************************************************************************************************************/
 cgemvt_kernel_S2_BEGIN:
 	// strided path: turn both increments into byte strides first
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #endif
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	cgemvt_kernel_S1_BEGIN
 cgemvt_kernel_S2X4:
 	// AO1 / AO2 = current column pair; the saved A moves on by two columns
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	ldr	XO , X
 	INIT_S2
 	asrs	I, M, #2					// I = M / 4
 	ble	cgemvt_kernel_S2X1
 cgemvt_kernel_S2X4_10:
 	KERNEL_S2X4
 	subs	I, I, #1
 	bne	cgemvt_kernel_S2X4_10
 cgemvt_kernel_S2X1:
 	// remaining M % 4 entries, one at a time
 	ands	I, M , #3
 	ble	cgemvt_kernel_S2_END
 cgemvt_kernel_S2X1_10:
 	KERNEL_S2X1
 	subs	I, I, #1
 	bne	cgemvt_kernel_S2X1_10
 cgemvt_kernel_S2_END:
 	SAVE_S2
 	subs	J , J , #1
 	bne	cgemvt_kernel_S2X4
 cgemvt_kernel_S1_BEGIN:
 	// one leftover column when N is odd
 	ldr	J, N
 	ands	J, J, #1
 	ble	cgemvt_kernel_L999
 cgemvt_kernel_S1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_S1
 	asrs	I, M, #2					// I = M / 4
 	ble	cgemvt_kernel_S1X1
 cgemvt_kernel_S1X4_10:
 	KERNEL_S1X4
 	subs	I, I, #1
 	bne	cgemvt_kernel_S1X4_10
 cgemvt_kernel_S1X1:
 	ands	I, M , #3
 	ble	cgemvt_kernel_S1_END
 cgemvt_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	I, I, #1
 	bne	cgemvt_kernel_S1X1_10
 cgemvt_kernel_S1_END:
 	SAVE_S1
 /*************************************************************************************************************/
 cgemvt_kernel_L999:
 	// common exit: reload the registers saved by the prologue and unwind the frame
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s15 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/copy.c
+++ b/kernel/arm/copy.c
@ -0,0 +1,59 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Strided vector copy: writes x[k * inc_x] into y[k * inc_y] for k = 0 .. n-1.
  * A non-positive n copies nothing. Always returns 0.
  */
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG src = 0;
 	BLASLONG dst = 0;
 	BLASLONG k;
 	/* The loop condition already rejects n <= 0, so no separate guard is needed. */
 	for (k = 0; k < n; k++) {
 		y[dst] = x[src];
 		src += inc_x;
 		dst += inc_y;
 	}
 	return 0;
 }
--- a/kernel/arm/ctrmm_kernel_2x2_vfp.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S
--- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
--- a/kernel/arm/dcopy_vfp.S
+++ b/kernel/arm/dcopy_vfp.S
@ -0,0 +1,222 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/07 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Size in bytes of the scratch frame the prologue reserves below the saved registers. */
 #define STACKSIZE 256
 /* Register-passed arguments. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* Stack-passed argument. The prologue sets fp = sp + 24 after pushing seven
  * registers, so [fp, #4] is the first word above them. */
 #define OLD_INC_Y	[fp, #4 ]
 /* Working registers: loop counter, destination pointer and destination stride. */
 #define I	r5
 #define Y	r6
 #define INC_Y	r7
 /* Prefetch distance in bytes ahead of X, used by COPY_F4. */
 #define X_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * COPY_F4: copy four contiguous doubles from X to Y through d0-d3,
  * post-incrementing both pointers, after prefetching X_PRE bytes ahead of X.
  */
 .macro COPY_F4
 	pld	[ X, #X_PRE  ]
 	fldmiad	X!, { d0 - d3 }
 	fstmiad	Y!, { d0 - d3 }
 .endm
 /* COPY_F1: copy one double from X to Y through d0, post-incrementing both pointers. */
 .macro COPY_F1
 	fldmiad	X!, { d0 }
 	fstmiad	Y!, { d0 }
 .endm
 /*************************************************************************************************************************/
 /*
  * COPY_S4: copy four doubles from X to Y on the strided path.
  * Each element is moved through d0 or d1 (alternating), after which X and Y
  * advance by the byte strides INC_X and INC_Y.
  */
 .macro COPY_S4
 	nop
 	fldmiad	X, { d0 }
 	fstmiad	Y, { d0 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d1 }
 	fstmiad	Y, { d1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d0 }
 	fstmiad	Y, { d0 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d1 }
 	fstmiad	Y, { d1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /* COPY_S1: copy one double from X to Y through d0, then advance X and Y by
  * the byte strides INC_X and INC_Y. */
 .macro COPY_S1
 	fldmiad	X, { d0 }
 	fstmiad	Y, { d0 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Entry point. Copies N doubles from X to Y. When both increments are 1 the
  * contiguous macros are used, otherwise the increments are scaled to byte
  * strides and the strided macros are used. Elements are handled four at a
  * time with a one-at-a-time tail for N % 4. Always returns 0 in r0.
  */
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 	// nothing to copy for N <= 0 or when either increment is zero
 	cmp	N, #0
 	ble	dcopy_kernel_L999
 	cmp	INC_X, #0
 	beq	dcopy_kernel_L999
 	cmp	INC_Y, #0
 	beq	dcopy_kernel_L999
 	// the contiguous path needs both increments equal to 1
 	cmp	INC_X, #1
 	bne	dcopy_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	dcopy_kernel_S_BEGIN
 dcopy_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	dcopy_kernel_F1
 dcopy_kernel_F4:
 	COPY_F4
 	subs	I, I, #1
 	bne	dcopy_kernel_F4
 dcopy_kernel_F1:
 	// remaining N % 4 elements
 	ands	I, N, #3
 	ble	dcopy_kernel_L999
 dcopy_kernel_F10:
 	COPY_F1
 	subs    I, I, #1
        bne     dcopy_kernel_F10
 	b	dcopy_kernel_L999
 dcopy_kernel_S_BEGIN:
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 	asrs	I, N, #2					// I = N / 4
 	ble	dcopy_kernel_S1
 dcopy_kernel_S4:
 	COPY_S4
 	subs	I, I, #1
 	bne	dcopy_kernel_S4
 dcopy_kernel_S1:
 	// remaining N % 4 elements
 	ands	I, N, #3
 	ble	dcopy_kernel_L999
 dcopy_kernel_S10:
 	COPY_S1
 	subs    I, I, #1
        bne     dcopy_kernel_S10
 dcopy_kernel_L999:
 	// common exit: reload d8-d15 saved by the prologue and unwind the frame
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/ddot_vfp.S
+++ b/kernel/arm/ddot_vfp.S
@ -0,0 +1,248 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/11 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Size in bytes of the scratch frame the prologue reserves below the saved registers. */
 #define STACKSIZE 256
 /* Register-passed arguments. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* Stack-passed argument. The prologue sets fp = sp + 24 after pushing seven
  * registers, so [fp, #4] is the first word above them. */
 #define OLD_INC_Y	[fp, #4 ]
 /* Working registers: loop counter, second vector pointer and its stride. */
 #define I	r5
 #define Y	r6
 #define INC_Y	r7
 /* Prefetch distance in bytes; KERNEL_F4 applies it to both X and Y. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * KERNEL_F4: four contiguous multiply-accumulate steps.
  * Loads four doubles from X into d8-d11 and four from Y into d4-d7 (both
  * pointers post-incremented), prefetching X_PRE bytes ahead of each.
  * Products 0 and 2 are added to d0, products 1 and 3 to d1; loads are
  * interleaved with the arithmetic.
  */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE  ]
 	fldmiad	X!, { d8 }
 	pld	[ Y, #X_PRE  ]
 	fldmiad	Y!, { d4 }
 	fldmiad	Y!, { d5 }
 	fmacd   d0  , d4,  d8
 	fldmiad	X!, { d9 }
 	fldmiad	Y!, { d6 }
 	fmacd   d1  , d5,  d9
 	fldmiad	X!, { d10 }
 	fldmiad	X!, { d11 }
 	fmacd   d0  , d6,  d10
 	fldmiad	Y!, { d7 }
 	fmacd   d1  , d7,  d11
 .endm
 /* KERNEL_F1: load one double from X and one from Y (post-incrementing both)
  * and add their product to d0. */
 .macro KERNEL_F1
 	fldmiad	X!, { d4 }
 	fldmiad	Y!, { d8 }
 	fmacd   d0  , d4,  d8
 .endm
 /*************************************************************************************************************************/
 /*
  * KERNEL_S4: four strided multiply-accumulate steps.
  * Each step loads one double from X and one from Y, advances the pointers by
  * the byte strides INC_X / INC_Y and adds the product to d0 (steps 0 and 2)
  * or d1 (steps 1 and 3).
  */
 .macro KERNEL_S4
 	nop
 	fldmiad	X, { d4 }
 	fldmiad	Y, { d8 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacd   d0  , d4,  d8
 	fldmiad	X, { d5 }
 	fldmiad	Y, { d9 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacd   d1  , d5,  d9
 	fldmiad	X, { d6 }
 	fldmiad	Y, { d10 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacd   d0  , d6,  d10
 	fldmiad	X, { d7 }
 	fldmiad	Y, { d11 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacd   d1  , d7,  d11
 .endm
 /* KERNEL_S1: load one double from X and one from Y, advance both pointers by
  * their byte strides and add the product to d0. */
 .macro KERNEL_S1
 	fldmiad	X, { d4 }
 	fldmiad	Y, { d8 }
 	add	X, X, INC_X
 	fmacd   d0  , d4,  d8
 	add	Y, Y, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 	vsub.f64                d0 , d0 , d0
 	vsub.f64                d1 , d1 , d1
 	cmp	N, #0
 	ble	ddot_kernel_L999
 	cmp	INC_X, #0
 	beq	ddot_kernel_L999
 	cmp	INC_Y, #0
 	beq	ddot_kernel_L999
 	cmp	INC_X, #1
 	bne	ddot_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	ddot_kernel_S_BEGIN
 ddot_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	ddot_kernel_F1
 ddot_kernel_F4:
 	KERNEL_F4
 	subs	I, I, #1
 	ble	ddot_kernel_F1
 	KERNEL_F4
 	subs	I, I, #1
 	bne	ddot_kernel_F4
 ddot_kernel_F1:
 	ands	I, N, #3
 	ble	ddot_kernel_L999
 ddot_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     ddot_kernel_F10
 	b	ddot_kernel_L999
 ddot_kernel_S_BEGIN:
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 	asrs	I, N, #2					// I = N / 4
 	ble	ddot_kernel_S1
 ddot_kernel_S4:
 	KERNEL_S4
 	subs	I, I, #1
 	bne	ddot_kernel_S4
 ddot_kernel_S1:
 	ands	I, N, #3
 	ble	ddot_kernel_L999
 ddot_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     ddot_kernel_S10
 ddot_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	vadd.f64	d0 , d0, d1				// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/dgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/dgemm_kernel_4x2_vfp.S
@ -0,0 +1,806 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/27 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Size in bytes of the scratch frame the prologue reserves below the saved registers. */
 #define STACKSIZE 256
 /* Arguments as they arrive in registers on entry. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_K	r2
 #define	OLD_A	r3
 #define OLD_ALPHA d0
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* Spill slots inside the reserved frame; the prologue fills them from the
  * incoming registers and from OLD_LDC (scaled to bytes). */
 #define LDC	[fp, #-252 ]
 #define M	[fp, #-256 ]
 #define N	[fp, #-260 ]
 #define K	[fp, #-264 ]
 #define A	[fp, #-268 ]
 #define ALPHA	[fp, #-280]
 /* Stack-passed arguments. The prologue sets fp = sp + 24 after pushing seven
  * registers, so [fp, #4] is the first word above them. */
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
 /* Working registers. I, J and L reuse r0-r2 once M, N and K have been spilled. */
 #define I	r0
 #define J	r1
 #define L	r2
 #define	AO	r5
 #define	BO	r6
 #define	CO1	r8
 #define	CO2	r9
 #define K1	r7
 #define BC	r12
 /* Prefetch distances in bytes for the A, B and C pointers. */
 #define A_PRE	96
 #define B_PRE	96
 #define C_PRE	32
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * INIT4x2: set the eight accumulators d8-d15 to +0.0.
  * The zero is taken from an integer register rather than computed as d8 - d8:
  * d8 still holds the previous tile's result (or the caller's value), and a
  * self-subtraction of Inf or NaN yields NaN, which would leak into this tile.
  * Clobbers r3, which is only a short-lived temporary where this macro is expanded.
  */
 .macro INIT4x2
 	mov	r3 , #0
 	vmov	d8 , r3 , r3
 	vmov.f64		d9, d8
 	vmov.f64		d10, d8
 	vmov.f64		d11, d8
 	vmov.f64		d12, d8
 	vmov.f64		d13, d8
 	vmov.f64		d14, d8
 	vmov.f64		d15, d8
 .endm
 /*
  * KERNEL4x2_SUB: one k-step of the 4x2 tile.
  * Reads four doubles at AO (d0-d3) and two at BO (d4, d5), then accumulates
  * d8-d11 += d0-d3 * d4 and d12-d15 += d0-d3 * d5. AO advances by 32 bytes and
  * BO by 16. Loads and pointer updates are interleaved with the arithmetic;
  * A_PRE bytes ahead of AO are prefetched.
  */
 .macro KERNEL4x2_SUB
 	pld	[ AO, #A_PRE ]
 	fldd	d4 , [ BO ]
 	fldd	d0 , [ AO ]
 	fldd	d1 , [ AO, #8 ]
 	fmacd	d8  , d0,  d4
 	fldd	d2 , [ AO, #16 ]
 	fmacd	d9  , d1,  d4
 	fldd	d3 , [ AO, #24 ]
 	fmacd	d10  , d2,  d4
 	fldd	d5 , [ BO, #8 ]
 	fmacd	d11  , d3,  d4
 	fmacd	d12  , d0,  d5
 	fmacd	d13  , d1,  d5
 	add	AO , AO, #32
 	fmacd	d14  , d2,  d5
 	add	BO , BO, #16
 	fmacd	d15  , d3,  d5
 .endm
 /*
  * SAVE4x2: fold the 4x2 tile into C.
  * CO2 is set to CO1 + LDC (LDC is a byte stride), alpha is reloaded into d0,
  * and four doubles at CO1 and at CO2 are updated as C += alpha * accumulator
  * (d8-d11 for CO1, d12-d15 for CO2). CO1 then advances by 32 bytes.
  * Clobbers r3 and d0, d4-d7.
  */
 .macro SAVE4x2
 	ldr	r3  , LDC
 	add	CO2 , CO1, r3
 	fldd		d0, ALPHA
 	fldd	d4 , [CO1]
 	fldd	d5 , [CO1, #8 ]
 	pld	[ CO1, #C_PRE ]
 	fmacd	d4 , d0 , d8
 	fldd	d6 , [CO1, #16 ]
 	fmacd	d5 , d0 , d9
 	fldd	d7 , [CO1, #24 ]
 	fmacd	d6 , d0 , d10
 	fstd	d4 , [CO1]
 	fmacd	d7 , d0 , d11
 	fstd	d5 , [CO1, #8 ]
 	fstd	d6 , [CO1, #16 ]
 	fstd	d7 , [CO1, #24 ]
 	fldd	d4 , [CO2]
 	fldd	d5 , [CO2, #8 ]
 	pld	[ CO2, #C_PRE ]
 	fmacd	d4 , d0 , d12
 	fldd	d6 , [CO2, #16 ]
 	fmacd	d5 , d0 , d13
 	fldd	d7 , [CO2, #24 ]
 	fmacd	d6 , d0 , d14
 	fstd	d4 , [CO2]
 	fmacd	d7 , d0 , d15
 	add	CO1, CO1, #32
 	fstd	d5 , [CO2, #8 ]
 	fstd	d6 , [CO2, #16 ]
 	fstd	d7 , [CO2, #24 ]
 .endm
 /******************************************************************************/
 /*
  * INIT2x2: set the accumulators d8, d9, d12 and d13 to +0.0.
  * The zero is taken from an integer register rather than computed as d8 - d8:
  * d8 still holds an earlier tile's result, and a self-subtraction of Inf or
  * NaN yields NaN, which would leak into this tile.
  * Clobbers r3, which is only a short-lived temporary where this macro is expanded.
  */
 .macro INIT2x2
 	mov	r3 , #0
 	vmov	d8 , r3 , r3
 	vmov.f64		d9, d8
 	vmov.f64		d12, d8
 	vmov.f64		d13, d8
 .endm
 /*
  * KERNEL2x2_SUB: one k-step of the 2x2 tile.
  * Reads two doubles at BO (d4, d5) and two at AO (d0, d1), accumulates
  * d8/d9 += d0/d1 * d4 and d12/d13 += d0/d1 * d5, then advances AO and BO by
  * 16 bytes each.
  */
 .macro KERNEL2x2_SUB
 	fldd	d4 , [ BO ]
 	fldd	d5 , [ BO, #8 ]
 	fldd	d0 , [ AO ]
 	fldd	d1 , [ AO, #8 ]
 	fmacd	d8  , d0,  d4
 	fmacd	d9  , d1,  d4
 	fmacd	d12  , d0,  d5
 	fmacd	d13  , d1,  d5
 	add	AO , AO, #16
 	add	BO , BO, #16
 .endm
 /*
  * SAVE2x2: fold the 2x2 tile into C.
  * CO2 = CO1 + LDC (byte stride); two doubles at CO1 get alpha * d8/d9 added,
  * two doubles at CO2 get alpha * d12/d13 added. CO1 then advances by 16 bytes.
  * Clobbers r3 and d0, d4, d5.
  */
 .macro SAVE2x2
 	ldr	r3  , LDC
 	add	CO2 , CO1, r3
 	fldd		d0, ALPHA
 	fldd	d4 , [CO1]
 	fldd	d5 , [CO1, #8 ]
 	fmacd	d4 , d0 , d8
 	fmacd	d5 , d0 , d9
 	fstd	d4 , [CO1]
 	fstd	d5 , [CO1, #8 ]
 	fldd	d4 , [CO2]
 	fldd	d5 , [CO2, #8 ]
 	fmacd	d4 , d0 , d12
 	fmacd	d5 , d0 , d13
 	fstd	d4 , [CO2]
 	fstd	d5 , [CO2, #8 ]
 	add	CO1, CO1, #16
 .endm
 /******************************************************************************/
 /*
  * INIT1x2: set the accumulators d8 and d12 to +0.0.
  * The zero is taken from an integer register rather than computed as d8 - d8:
  * d8 still holds an earlier tile's result, and a self-subtraction of Inf or
  * NaN yields NaN, which would leak into this tile.
  * Clobbers r3, which is only a short-lived temporary where this macro is expanded.
  */
 .macro INIT1x2
 	mov	r3 , #0
 	vmov	d8 , r3 , r3
 	vmov.f64		d12, d8
 .endm
 /*
  * KERNEL1x2_SUB: one k-step of the 1x2 tile.
  * Reads two doubles at BO (d4, d5) and one at AO (d0), accumulates
  * d8 += d0 * d4 and d12 += d0 * d5, then advances AO by 8 and BO by 16 bytes.
  */
 .macro KERNEL1x2_SUB
 	fldd	d4 , [ BO ]
 	fldd	d5 , [ BO, #8 ]
 	fldd	d0 , [ AO ]
 	fmacd	d8  , d0,  d4
 	fmacd	d12  , d0,  d5
 	add	AO , AO, #8
 	add	BO , BO, #16
 .endm
 /*
  * SAVE1x2: fold the 1x2 tile into C.
  * CO2 = CO1 + LDC (byte stride); the double at CO1 gets alpha * d8 added and
  * the double at CO2 gets alpha * d12 added. CO1 then advances by 8 bytes.
  * Clobbers r3 and d0, d4.
  */
 .macro SAVE1x2
 	ldr	r3  , LDC
 	add	CO2 , CO1, r3
 	fldd		d0, ALPHA
 	fldd	d4 , [CO1]
 	fmacd	d4 , d0 , d8
 	fstd	d4 , [CO1]
 	fldd	d4 , [CO2]
 	fmacd	d4 , d0 , d12
 	fstd	d4 , [CO2]
 	add	CO1, CO1, #8
 .endm
 /******************************************************************************/
 /*
  * INIT4x1: clear the accumulators d8-d11 by subtracting d8 from itself and
  * copying the result to d9-d11.
  * NOTE(review): d8 - d8 is NaN, not 0, if d8 holds Inf or NaN at this point;
  * confirm whether an earlier tile's result can still be in d8 here.
  */
 .macro INIT4x1
 	vsub.f64		d8 , d8 , d8
 	vmov.f64		d9, d8
 	vmov.f64		d10, d8
 	vmov.f64		d11, d8
 .endm
 /*
  * KERNEL4x1_SUB: one k-step of the 4x1 tile.
  * Reads one double at BO (d4) and four at AO (d0-d3), accumulates
  * d8-d11 += d0-d3 * d4, then advances AO by 32 and BO by 8 bytes.
  */
 .macro KERNEL4x1_SUB
 	fldd	d4 , [ BO ]
 	fldd	d0 , [ AO ]
 	fldd	d1 , [ AO, #8 ]
 	fldd	d2 , [ AO, #16 ]
 	fldd	d3 , [ AO, #24 ]
 	fmacd	d8  , d0,  d4
 	fmacd	d9  , d1,  d4
 	fmacd	d10 , d2,  d4
 	fmacd	d11 , d3,  d4
 	add	AO , AO, #32
 	add	BO , BO, #8
 .endm
 /*
  * SAVE4x1: fold the 4x1 tile into C.
  * Reloads alpha into d0, adds alpha * d8-d11 to the four doubles at CO1 and
  * advances CO1 by 32 bytes. Clobbers d0, d4-d7.
  */
 .macro SAVE4x1
 	fldd		d0, ALPHA
 	fldd	d4 , [CO1]
 	fldd	d5 , [CO1, #8 ]
 	fldd	d6 , [CO1, #16 ]
 	fldd	d7 , [CO1, #24 ]
 	fmacd	d4 , d0 , d8
 	fmacd	d5 , d0 , d9
 	fmacd	d6 , d0 , d10
 	fmacd	d7 , d0 , d11
 	fstd	d4 , [CO1]
 	fstd	d5 , [CO1, #8 ]
 	fstd	d6 , [CO1, #16 ]
 	fstd	d7 , [CO1, #24 ]
 	add	CO1, CO1, #32
 .endm
 /******************************************************************************/
 /*
  * INIT2x1: clear the accumulators d8 and d9 by subtracting d8 from itself and
  * copying the result to d9.
  * NOTE(review): d8 - d8 is NaN, not 0, if d8 holds Inf or NaN at this point;
  * confirm whether an earlier tile's result can still be in d8 here.
  */
 .macro INIT2x1
 	vsub.f64		d8 , d8 , d8
 	vmov.f64		d9 , d8
 .endm
 /*
  * KERNEL2x1_SUB: one k-step of the 2x1 tile.
  * Reads one double at BO (d4) and two at AO (d0, d1), accumulates
  * d8/d9 += d0/d1 * d4, then advances AO by 16 and BO by 8 bytes.
  */
 .macro KERNEL2x1_SUB
 	fldd	d4 , [ BO ]
 	fldd	d0 , [ AO ]
 	fldd	d1 , [ AO, #8 ]
 	fmacd	d8  , d0,  d4
 	fmacd	d9  , d1,  d4
 	add	AO , AO, #16
 	add	BO , BO, #8
 .endm
 /*
  * SAVE2x1: fold the 2x1 tile into C.
  * Reloads alpha into d0, adds alpha * d8/d9 to the two doubles at CO1 and
  * advances CO1 by 16 bytes. Clobbers d0, d4, d5.
  */
 .macro SAVE2x1
 	fldd		d0, ALPHA
 	fldd	d4 , [CO1]
 	fldd	d5 , [CO1, #8 ]
 	fmacd	d4 , d0 , d8
 	fmacd	d5 , d0 , d9
 	fstd	d4 , [CO1]
 	fstd	d5 , [CO1, #8 ]
 	add	CO1, CO1, #16
 .endm
 /******************************************************************************/
 /*
  * INIT1x1: clear the accumulator d8 by subtracting it from itself.
  * NOTE(review): d8 - d8 is NaN, not 0, if d8 holds Inf or NaN at this point;
  * confirm whether an earlier tile's result can still be in d8 here.
  */
 .macro INIT1x1
 	vsub.f64		d8 , d8 , d8
 .endm
 /* KERNEL1x1_SUB: one k-step of the 1x1 tile: d8 += [AO] * [BO], then AO and
  * BO each advance by 8 bytes. */
 .macro KERNEL1x1_SUB
 	fldd	d4 , [ BO ]
 	fldd	d0 , [ AO ]
 	fmacd	d8  , d0,  d4
 	add	AO , AO, #8
 	add	BO , BO, #8
 .endm
 /* SAVE1x1: reload alpha into d0, add alpha * d8 to the double at CO1 and
  * advance CO1 by 8 bytes. Clobbers d0, d4. */
 .macro SAVE1x1
 	fldd		d0, ALPHA
 	fldd	d4 , [CO1]
 	fmacd	d4 , d0 , d8
 	fstd	d4 , [CO1]
 	add	CO1, CO1, #8
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * dgemm micro-kernel body.
 	 *
 	 * The incoming arguments are spilled to the stack slots M, N, K, A, ALPHA
 	 * and LDC (slot and register aliases are defined earlier in this file),
 	 * d8-d15 are saved at [fp, #-128], and C is then updated tile by tile:
 	 *   - dgemm_kernel_L2_*: two columns of C per pass (C advances by 2 * LDC,
 	 *     BC by K * 2 * 8 bytes), in row tiles of 4, then 2, then 1;
 	 *   - dgemm_kernel_L1_*: the remaining column when N is odd, tiled the
 	 *     same way.
 	 * Each tile runs K / 8 unrolled passes followed by K % 8 single steps and
 	 * is written back by the matching SAVEmxn macro.  Returns 0 in r0.
 	 *
 	 * Branch conditions: ASR, ANDS and TST do not write the V flag, while BLE
 	 * tests Z || (N != V).  Signed counts are therefore checked with an
 	 * explicit CMP #0 before BLE, and bit-mask tests use BEQ, so that no
 	 * branch depends on a stale V flag (on entry V is whatever the caller
 	 * left behind).
 	 */
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	str	OLD_M, M
 	str	OLD_N, N
 	str	OLD_K, K
 	str	OLD_A, A
 	vstr	OLD_ALPHA, ALPHA
 	sub	r3, fp, #128
 	vstm	r3, { d8 - d15} 				// store floating point registers
 	ldr	r3, OLD_LDC
 	lsl	r3, r3, #3					// ldc = ldc * 8
 	str	r3, LDC
 	ldr	K1, K
 	ldr	BC, B
 	ldr	J, N
 	asr	J, J, #1					// J = J / 2
 	cmp	J, #0						// CMP defines V for the BLE below
 	ble	dgemm_kernel_L1_BEGIN
 /*********************************************************************************************/
 dgemm_kernel_L2_BEGIN:
 	ldr	CO1, C						// CO1 = C
 	ldr	r4 , LDC
 	lsl	r4 , r4 , #1					// LDC * 2
 	add	r3 , r4, CO1
 	str	r3 , C						// store C
 	ldr	AO, A						// AO = A
 dgemm_kernel_L2_M4_BEGIN:
 	ldr	I, M
 	asr	I, I, #2					// I = I / 4
 	cmp	I, #0
 	ble	dgemm_kernel_L2_M2_BEGIN
 dgemm_kernel_L2_M4_20:
 	INIT4x2
 	mov	BO, BC
 	asr	L , K1, #3					// L = K / 8
 	cmp	L , #0
 	ble	dgemm_kernel_L2_M4_40
 	.align 5
 dgemm_kernel_L2_M4_22:
 	pld	[ BO, #B_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	pld	[ BO, #B_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	pld	[ BO, #B_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	pld	[ BO, #B_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L2_M4_22
 dgemm_kernel_L2_M4_40:
 	ands	L , K1, #7					// L = K % 8
 	beq	dgemm_kernel_L2_M4_100
 dgemm_kernel_L2_M4_42:
 	KERNEL4x2_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L2_M4_42
 dgemm_kernel_L2_M4_100:
 	SAVE4x2
 dgemm_kernel_L2_M4_END:
 	subs	I, I, #1
 	bgt	dgemm_kernel_L2_M4_20
 dgemm_kernel_L2_M2_BEGIN:
 	ldr	I, M
 	tst	I , #3						// any rows left after the 4-row tiles?
 	beq	dgemm_kernel_L2_END
 	tst	I, #2						// is there a 2-row tile?
 	beq	dgemm_kernel_L2_M1_BEGIN
 dgemm_kernel_L2_M2_20:
 	INIT2x2
 	mov	BO, BC
 	asr	L , K1, #3					// L = K / 8
 	cmp	L , #0
 	ble	dgemm_kernel_L2_M2_40
 dgemm_kernel_L2_M2_22:
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L2_M2_22
 dgemm_kernel_L2_M2_40:
 	ands	L , K1, #7					// L = K % 8
 	beq	dgemm_kernel_L2_M2_100
 dgemm_kernel_L2_M2_42:
 	KERNEL2x2_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L2_M2_42
 dgemm_kernel_L2_M2_100:
 	SAVE2x2
 dgemm_kernel_L2_M2_END:
 dgemm_kernel_L2_M1_BEGIN:
 	tst	I, #1						// is there a final single row?
 	beq	dgemm_kernel_L2_END
 dgemm_kernel_L2_M1_20:
 	INIT1x2
 	mov	BO, BC
 	asr	L , K1, #3					// L = K / 8
 	cmp	L , #0
 	ble	dgemm_kernel_L2_M1_40
 dgemm_kernel_L2_M1_22:
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L2_M1_22
 dgemm_kernel_L2_M1_40:
 	ands	L , K1, #7					// L = K % 8
 	beq	dgemm_kernel_L2_M1_100
 dgemm_kernel_L2_M1_42:
 	KERNEL1x2_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L2_M1_42
 dgemm_kernel_L2_M1_100:
 	SAVE1x2
 dgemm_kernel_L2_END:
 	mov	r3, BC
 	mov	r4, K1
 	lsl	r4, r4, #4					// k * 2 * 8
 	add	r3, r3, r4					// B = B + K * 2 * 8
 	mov	BC, r3
 	subs	J , #1						// j--
 	bgt	dgemm_kernel_L2_BEGIN
 /*********************************************************************************************/
 dgemm_kernel_L1_BEGIN:
 	ldr	J , N
 	tst	J , #1						// odd N: one column left?
 	beq	dgemm_kernel_L999
 	ldr	CO1, C						// CO1 = C
 	ldr	r4 , LDC
 	add	r3 , r4, CO1
 	str	r3 , C						// store C
 	ldr	AO, A						// AO = A
 dgemm_kernel_L1_M4_BEGIN:
 	ldr	I, M
 	asr	I, I, #2					// I = I / 4
 	cmp	I, #0
 	ble	dgemm_kernel_L1_M2_BEGIN
 dgemm_kernel_L1_M4_20:
 	INIT4x1
 	mov	BO, BC
 	asr	L , K1, #3					// L = K / 8
 	cmp	L , #0
 	ble	dgemm_kernel_L1_M4_40
 	.align 5
 dgemm_kernel_L1_M4_22:
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L1_M4_22
 dgemm_kernel_L1_M4_40:
 	ands	L , K1, #7					// L = K % 8
 	beq	dgemm_kernel_L1_M4_100
 dgemm_kernel_L1_M4_42:
 	KERNEL4x1_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L1_M4_42
 dgemm_kernel_L1_M4_100:
 	SAVE4x1
 dgemm_kernel_L1_M4_END:
 	subs	I, I, #1
 	bgt	dgemm_kernel_L1_M4_20
 dgemm_kernel_L1_M2_BEGIN:
 	ldr	I, M
 	tst	I , #3						// any rows left after the 4-row tiles?
 	beq	dgemm_kernel_L1_END
 	tst	I, #2						// is there a 2-row tile?
 	beq	dgemm_kernel_L1_M1_BEGIN
 dgemm_kernel_L1_M2_20:
 	INIT2x1
 	mov	BO, BC
 	asr	L , K1, #3					// L = K / 8
 	cmp	L , #0
 	ble	dgemm_kernel_L1_M2_40
 dgemm_kernel_L1_M2_22:
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L1_M2_22
 dgemm_kernel_L1_M2_40:
 	ands	L , K1, #7					// L = K % 8
 	beq	dgemm_kernel_L1_M2_100
 dgemm_kernel_L1_M2_42:
 	KERNEL2x1_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L1_M2_42
 dgemm_kernel_L1_M2_100:
 	SAVE2x1
 dgemm_kernel_L1_M2_END:
 dgemm_kernel_L1_M1_BEGIN:
 	tst	I, #1						// is there a final single row?
 	beq	dgemm_kernel_L1_END
 dgemm_kernel_L1_M1_20:
 	INIT1x1
 	mov	BO, BC
 	asr	L , K1, #3					// L = K / 8
 	cmp	L , #0
 	ble	dgemm_kernel_L1_M1_40
 dgemm_kernel_L1_M1_22:
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L1_M1_22
 dgemm_kernel_L1_M1_40:
 	ands	L , K1, #7					// L = K % 8
 	beq	dgemm_kernel_L1_M1_100
 dgemm_kernel_L1_M1_42:
 	KERNEL1x1_SUB
 	subs	L, L, #1
 	bgt	dgemm_kernel_L1_M1_42
 dgemm_kernel_L1_M1_100:
 	SAVE1x1
 dgemm_kernel_L1_END:
 dgemm_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
--- a/kernel/arm/dgemm_ncopy_2_vfp.S
+++ b/kernel/arm/dgemm_ncopy_2_vfp.S
@ -0,0 +1,225 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/24 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Register and stack-slot aliases for the 2-column ncopy routine.               */
 /* STACKSIZE, OLD_M, OLD_N, OLD_A and A_PRE are defined but not referenced by    */
 /* the code in this file.                                                        */
 #define STACKSIZE 256
 /* Incoming argument registers. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /* With fp = sp + 24 after pushing seven registers, [fp, #4] is the first word  */
 /* above the saved registers -- presumably the first stack-passed argument      */
 /* (confirm against the caller's prototype).                                    */
 #define B	[fp, #4 ]
 /* M, N and A are used in place in their incoming registers. */
 #define M	r0
 #define N	r1
 #define A	r2
 /* BO: output cursor; AO1/AO2: cursors into two source columns LDA bytes apart. */
 #define	BO	r5
 #define	AO1	r6
 #define	AO2	r7
 #define	LDA	r8
 /* I reuses r3 (OLD_LDA); the prologue copies lda into LDA before I is written. */
 #define I	r3
 #define	J	r12
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /* COPY2x2: read two consecutive doubles from AO1 and from AO2 and store them   */
 /* interleaved at BO as AO1[0], AO2[0], AO1[1], AO2[1] (d0-d3).  BO advances by */
 /* 32 bytes through the store's writeback; AO1 and AO2 advance by 16 bytes.     */
 .macro COPY2x2
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d2 , [ AO1, #8  ]
 	fldd	d1 , [ AO2, #0  ]
 	fldd	d3 , [ AO2, #8  ]
 	add	AO1, AO1, #16
 	fstmiad	BO!, { d0 - d3 }
 	add	AO2, AO2, #16
 .endm
 /* COPY1x2: read one double from AO1 and one from AO2 and store them at BO in   */
 /* that order.  BO advances by 16 bytes (writeback); AO1 and AO2 by 8 bytes.    */
 .macro COPY1x2
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO2, #0  ]
 	add	AO1, AO1, #8
 	fstmiad	BO!, { d0 - d1 }
 	add	AO2, AO2, #8
 .endm
 /* COPY2x1: copy two consecutive doubles from AO1 to BO.  BO advances by 16     */
 /* bytes (writeback) and AO1 by 16 bytes.                                       */
 .macro COPY2x1
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO1, #8  ]
 	fstmiad	BO!, { d0 - d1 }
 	add	AO1, AO1, #16
 .endm
 /* COPY1x1: copy a single double from AO1 to BO; both cursors advance 8 bytes.  */
 .macro COPY1x1
 	fldd	d0 , [ AO1, #0  ]
 	fstmiad	BO!, { d0 }
 	add	AO1, AO1, #8
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * 2-column ncopy: packs the source matrix at A into the buffer at B.
 	 *
 	 * Columns are LDA = lda * 8 bytes apart.  Pairs of columns are walked
 	 * together and written interleaved (COPY2x2 / COPY1x2); if N is odd the
 	 * last column is copied on its own (COPY2x1 / COPY1x1).  Each column is
 	 * M doubles long.  Returns 0 in r0.
 	 *
 	 * Branch conditions: ASR, ANDS and TST do not write the V flag, while BLE
 	 * tests Z || (N != V).  Signed counts are therefore checked with an
 	 * explicit CMP #0 before BLE, and bit-mask tests use BEQ, so that no
 	 * branch depends on a stale V flag (on entry V is whatever the caller
 	 * left behind).
 	 */
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	lsl	LDA, OLD_LDA, #3					// lda = lda * 8
 	ldr	BO, B
 /*********************************************************************************************/
 dgemm_ncopy_L2_BEGIN:
 	asr	J, N, #1					// J = N / 2
 	cmp	J, #0						// CMP defines V for the BLE below
 	ble	dgemm_ncopy_L1_BEGIN
 dgemm_ncopy_L2_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	add	AO2, AO1, LDA
 	add	A  , AO2, LDA 					// A = A + 2 * LDA
 	asr	I, M, #1					// I = M / 2
 	cmp	I, #0
 	ble	dgemm_ncopy_L2_M2_40
 dgemm_ncopy_L2_M2_20:
 	COPY2x2
 	subs	I , I , #1
 	bne	dgemm_ncopy_L2_M2_20
 dgemm_ncopy_L2_M2_40:
 	ands	I, M , #1					// I = M % 2
 	beq	dgemm_ncopy_L2_M2_END
 dgemm_ncopy_L2_M2_60:
 	COPY1x2
 	subs	I , I , #1
 	bne	dgemm_ncopy_L2_M2_60
 dgemm_ncopy_L2_M2_END:
 	subs	J , J, #1						// j--
 	bne	dgemm_ncopy_L2_M2_BEGIN
 /*********************************************************************************************/
 dgemm_ncopy_L1_BEGIN:
 	tst	N, #1						// odd N: one column left?
 	beq	dgemm_ncopy_L999
 dgemm_ncopy_L1_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	add	A  , AO1, LDA 					// A = A + 1 * LDA
 	asr	I, M, #1					// I = M / 2
 	cmp	I, #0
 	ble	dgemm_ncopy_L1_M2_40
 dgemm_ncopy_L1_M2_20:
 	COPY2x1
 	subs	I , I , #1
 	bne	dgemm_ncopy_L1_M2_20
 dgemm_ncopy_L1_M2_40:
 	ands	I, M , #1					// I = M % 2
 	beq	dgemm_ncopy_L1_M2_END
 dgemm_ncopy_L1_M2_60:
 	COPY1x1
 	subs	I , I , #1
 	bne	dgemm_ncopy_L1_M2_60
 dgemm_ncopy_L1_M2_END:
 dgemm_ncopy_L999:
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/dgemm_ncopy_4_vfp.S
+++ b/kernel/arm/dgemm_ncopy_4_vfp.S
@ -0,0 +1,349 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/05 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Register and stack-slot aliases for the 4-column ncopy routine.              */
 /* STACKSIZE is the number of bytes the prologue reserves below the saved       */
 /* registers.  OLD_M, OLD_N, OLD_A and OLD_LDA are not referenced by the code   */
 /* in this file (the prologue scales r3 directly).                              */
 #define STACKSIZE 256
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* LDA (lda * 8, in bytes) lives in a stack slot because r8/r9 hold AO3/AO4.    */
 #define LDA	[fp, #-260 ]
 /* With fp = sp + 24 after pushing seven registers, [fp, #4] is the first word  */
 /* above the saved registers -- presumably the first stack-passed argument      */
 /* (confirm against the caller's prototype).                                    */
 #define B	[fp, #4 ]
 #define M	r0
 #define N	r1
 #define A	r2
 /* BO: output cursor; AO1..AO4: cursors into four source columns. */
 #define	BO	r5
 #define	AO1	r6
 #define	AO2	r7
 #define	AO3	r8
 #define	AO4	r9
 /* I reuses r3; the scaled lda is stored to the LDA slot before I is written.   */
 #define I	r3
 #define	J	r12
 /* Prefetch distance in bytes used by the pld instructions in COPY4x4. */
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /* COPY4x4: read four consecutive doubles from each of AO1..AO4 and store 16    */
 /* doubles at BO, grouped by element index: d0-d3 hold element 0 of AO1..AO4,   */
 /* d4-d7 element 1, d8-d11 element 2, d12-d15 element 3.  BO advances by 128    */
 /* bytes through the three writeback stores; each AOn advances by 32 bytes.     */
 /* Each source cursor is prefetched A_PRE bytes ahead.  Uses d0-d15, so the     */
 /* caller must have saved d8-d15.                                               */
 .macro COPY4x4
 	pld	[ AO1, #A_PRE  ]
 	pld	[ AO2, #A_PRE  ]
 	pld	[ AO3, #A_PRE  ]
 	pld	[ AO4, #A_PRE  ]
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO2, #0  ]
 	fldd	d2 , [ AO3, #0  ]
 	fldd	d3 , [ AO4, #0  ]
 	fldd	d4 , [ AO1, #8  ]
 	fldd	d8 , [ AO1, #16 ]
 	fldd	d12, [ AO1, #24 ]
 	fldd	d5 , [ AO2, #8  ]
 	add	AO1, AO1, #32
 	fldd	d9 , [ AO2, #16 ]
 	fldd	d13, [ AO2, #24 ]
 	fldd	d6 , [ AO3, #8  ]
 	add	AO2, AO2, #32
 	fldd	d10, [ AO3, #16 ]
 	fldd	d14, [ AO3, #24 ]
 	fldd	d7 , [ AO4, #8  ]
 	add	AO3, AO3, #32
 	fldd	d11, [ AO4, #16 ]
 	fldd	d15, [ AO4, #24 ]
 	fstmiad	BO!, { d0 - d3 }
 	add	AO4, AO4, #32
 	fstmiad	BO!, { d4 - d7 }
 	fstmiad	BO!, { d8 - d15 }
 .endm
 /* COPY1x4: read one double from each of AO1..AO4 and store the four values at  */
 /* BO in that order.  BO advances by 32 bytes (writeback); each AOn by 8 bytes. */
 .macro COPY1x4
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO2, #0  ]
 	add	AO1, AO1, #8
 	fldd	d2 , [ AO3, #0  ]
 	add	AO2, AO2, #8
 	fldd	d3 , [ AO4, #0  ]
 	add	AO3, AO3, #8
 	fstmiad	BO!, { d0 - d3 }
 	add	AO4, AO4, #8
 .endm
 /* COPY4x2: read four consecutive doubles from AO1 (even d registers) and from  */
 /* AO2 (odd d registers) and store them interleaved at BO as AO1[0], AO2[0],    */
 /* AO1[1], AO2[1], ... (d0-d7).  BO advances by 64 bytes (writeback); AO1 and   */
 /* AO2 advance by 32 bytes.                                                     */
 .macro COPY4x2
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d2 , [ AO1, #8  ]
 	fldd	d4 , [ AO1, #16 ]
 	fldd	d6 , [ AO1, #24 ]
 	fldd	d1 , [ AO2, #0  ]
 	fldd	d3 , [ AO2, #8  ]
 	add	AO1, AO1, #32
 	fldd	d5 , [ AO2, #16 ]
 	fldd	d7 , [ AO2, #24 ]
 	fstmiad	BO!, { d0 - d7 }
 	add	AO2, AO2, #32
 .endm
 /* COPY1x2: read one double from AO1 and one from AO2 and store them at BO in   */
 /* that order.  BO advances by 16 bytes (writeback); AO1 and AO2 by 8 bytes.    */
 .macro COPY1x2
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO2, #0  ]
 	add	AO1, AO1, #8
 	fstmiad	BO!, { d0 - d1 }
 	add	AO2, AO2, #8
 .endm
 /* COPY4x1: copy four consecutive doubles from AO1 to BO.  BO advances by 32    */
 /* bytes (writeback) and AO1 by 32 bytes.                                       */
 .macro COPY4x1
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO1, #8  ]
 	fldd	d2 , [ AO1, #16 ]
 	fldd	d3 , [ AO1, #24 ]
 	fstmiad	BO!, { d0 - d3 }
 	add	AO1, AO1, #32
 .endm
 /* COPY1x1: copy a single double from AO1 to BO; both cursors advance 8 bytes.  */
 .macro COPY1x1
 	fldd	d0 , [ AO1, #0  ]
 	fstmiad	BO!, { d0 }
 	add	AO1, AO1, #8
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * 4-column ncopy: packs the source matrix at A into the buffer at B.
 	 *
 	 * Columns are lda * 8 bytes apart (the scaled value is kept in the LDA
 	 * stack slot).  Groups of four columns are walked together and written
 	 * interleaved (COPY4x4 / COPY1x4); a remaining pair of columns is handled
 	 * by COPY4x2 / COPY1x2 and a final single column by COPY4x1 / COPY1x1.
 	 * Each column is M doubles long.  d8-d15 are saved at [fp, #-128] and
 	 * restored on exit because COPY4x4 uses them.  Returns 0 in r0.
 	 *
 	 * Branch conditions: ASR, ANDS and TST do not write the V flag, while BLE
 	 * tests Z || (N != V).  Signed counts are therefore checked with an
 	 * explicit CMP #0 before BLE, and bit-mask tests use BEQ, so that no
 	 * branch depends on a stale V flag (on entry V is whatever the caller
 	 * left behind).
 	 */
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	lsl	r3, r3, #3					// lda = lda * 8
 	str	r3, LDA
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	ldr	BO, B
 dgemm_ncopy_L4_BEGIN:
 	asr	J, N, #2					// J = N / 4
 	cmp	J, #0						// CMP defines V for the BLE below
 	ble	dgemm_ncopy_L2_BEGIN
 dgemm_ncopy_L4_M4_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	AO2, AO1, r4
 	add	AO3, AO2, r4
 	add	AO4, AO3, r4
 	add	A  , AO4, r4					// A = A + 4 * LDA
 	asr	I, M, #2					// I = M / 4
 	cmp	I, #0
 	ble	dgemm_ncopy_L4_M4_40
 dgemm_ncopy_L4_M4_20:
 	COPY4x4
 	subs	I , I , #1
 	bne	dgemm_ncopy_L4_M4_20
 dgemm_ncopy_L4_M4_40:
 	ands	I, M , #3					// I = M % 4
 	beq	dgemm_ncopy_L4_M4_END
 dgemm_ncopy_L4_M4_60:
 	COPY1x4
 	subs	I , I , #1
 	bne	dgemm_ncopy_L4_M4_60
 dgemm_ncopy_L4_M4_END:
 	subs	J , J, #1						// j--
 	bne	dgemm_ncopy_L4_M4_BEGIN
 /*********************************************************************************************/
 dgemm_ncopy_L2_BEGIN:
 	tst	N, #3						// any columns left after the groups of four?
 	beq	dgemm_ncopy_L999
 	tst	N, #2						// is there a pair of columns?
 	beq	dgemm_ncopy_L1_BEGIN
 dgemm_ncopy_L2_M4_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	AO2, AO1, r4
 	add	A  , AO2, r4 					// A = A + 2 * LDA
 	asr	I, M, #2					// I = M / 4
 	cmp	I, #0
 	ble	dgemm_ncopy_L2_M4_40
 dgemm_ncopy_L2_M4_20:
 	COPY4x2
 	subs	I , I , #1
 	bne	dgemm_ncopy_L2_M4_20
 dgemm_ncopy_L2_M4_40:
 	ands	I, M , #3					// I = M % 4
 	beq	dgemm_ncopy_L2_M4_END
 dgemm_ncopy_L2_M4_60:
 	COPY1x2
 	subs	I , I , #1
 	bne	dgemm_ncopy_L2_M4_60
 dgemm_ncopy_L2_M4_END:
 /*********************************************************************************************/
 dgemm_ncopy_L1_BEGIN:
 	tst	N, #1						// is there a final single column?
 	beq	dgemm_ncopy_L999
 dgemm_ncopy_L1_M4_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	A  , AO1, r4 					// A = A + 1 * LDA
 	asr	I, M, #2					// I = M / 4
 	cmp	I, #0
 	ble	dgemm_ncopy_L1_M4_40
 dgemm_ncopy_L1_M4_20:
 	COPY4x1
 	subs	I , I , #1
 	bne	dgemm_ncopy_L1_M4_20
 dgemm_ncopy_L1_M4_40:
 	ands	I, M , #3					// I = M % 4
 	beq	dgemm_ncopy_L1_M4_END
 dgemm_ncopy_L1_M4_60:
 	COPY1x1
 	subs	I , I , #1
 	bne	dgemm_ncopy_L1_M4_60
 dgemm_ncopy_L1_M4_END:
 dgemm_ncopy_L999:
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/dgemm_tcopy_4_vfp.S
+++ b/kernel/arm/dgemm_tcopy_4_vfp.S
@ -0,0 +1,408 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/06 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Register and stack-slot aliases for the 4-wide tcopy routine.                */
 /* STACKSIZE is the number of bytes the prologue reserves below the saved       */
 /* registers.  OLD_M and OLD_N are not referenced by the code in this file.     */
 #define STACKSIZE 256
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* With fp = sp + 24 after pushing seven registers, [fp, #4] is the first word  */
 /* above the saved registers -- presumably the first stack-passed argument      */
 /* (confirm against the caller's prototype).  The routine also writes its       */
 /* running output position back to this slot.                                   */
 #define B	[fp, #4 ]
 /* Running source pointer, kept on the stack because r2 is reused as M4.        */
 #define A	[fp, #-248 ]
 #define M	r0
 #define N	r1
 /* M4 = M * 32 bytes; it reuses r2 (OLD_A), which the prologue spills to the A  */
 /* slot before M4 is computed.                                                  */
 #define M4	r2
 #define	LDA	r5
 /* AO1: source cursor; BO1/BO2/BO3: output cursors used by the 4-, 2- and       */
 /* 1-wide copy macros respectively.                                             */
 #define	AO1	r6
 #define	BO1	r7
 #define	BO2	r8
 #define	BO3	r9
 #define I	r4
 #define	J	r12
 /* Prefetch distance in bytes used by the pld instructions in the copy macros.  */
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /* COPY4x4: read four consecutive doubles from AO1 and from the three lines     */
 /* that follow it at LDA-byte steps (d0-d3, d4-d7, d8-d11, d12-d15) and store   */
 /* all 16 contiguously at BO1.  AO1 then advances by 32 bytes and BO1 by M4.    */
 /* r3 is used as the scratch line pointer; each line is prefetched A_PRE bytes  */
 /* ahead.  Uses d0-d15, so the caller must have saved d8-d15.                   */
 .macro COPY4x4
 	pld	[ AO1, #A_PRE  ]
 	fldmiad	AO1, { d0 - d3 }
 	add	r3, AO1, LDA
 	pld	[ r3, #A_PRE  ]
 	fldmiad	r3, { d4 - d7 }
 	add	r3, r3, LDA
 	pld	[ r3, #A_PRE  ]
 	fldmiad	r3, { d8 - d11 }
 	add	r3, r3, LDA
 	pld	[ r3, #A_PRE  ]
 	fldmiad	r3, { d12 - d15 }
 	fstmiad	BO1, { d0 - d15 }
 	add	AO1, AO1, #32
 	add	BO1, BO1, M4
 .endm
 /* COPY2x4: read two consecutive doubles from AO1 and from the three lines that */
 /* follow it at LDA-byte steps (d0-d7) and store all 8 contiguously at BO2.     */
 /* AO1 then advances by 16 bytes and BO2 by 64 bytes.  r3 is scratch.           */
 .macro COPY2x4
 	fldmiad	AO1, { d0 - d1 }
 	add	r3, AO1, LDA
 	fldmiad	r3, { d2 - d3 }
 	add	r3, r3, LDA
 	fldmiad	r3, { d4 - d5 }
 	add	r3, r3, LDA
 	fldmiad	r3, { d6 - d7 }
 	fstmiad	BO2, { d0 - d7 }
 	add	AO1, AO1, #16
 	add	BO2, BO2, #64
 .endm
 /* COPY1x4: read one double from AO1 and from the three lines that follow it at */
 /* LDA-byte steps (d0-d3) and store the four values contiguously at BO3.  AO1   */
 /* then advances by 8 bytes and BO3 by 32 bytes.  r3 is scratch.                */
 .macro COPY1x4
 	fldmiad	AO1, { d0 }
 	add	r3, AO1, LDA
 	fldmiad	r3, { d1 }
 	add	r3, r3, LDA
 	fldmiad	r3, { d2 }
 	add	r3, r3, LDA
 	fldmiad	r3, { d3 }
 	fstmiad	BO3, { d0 - d3 }
 	add	AO1, AO1, #8
 	add	BO3, BO3, #32
 .endm
 /*************************************************************************************************************************/
 /* COPY4x2: read four consecutive doubles from AO1 and from the line LDA bytes  */
 /* after it (d0-d3, d4-d7) and store all 8 contiguously at BO1.  AO1 then       */
 /* advances by 32 bytes and BO1 by M4.  r3 is scratch; both lines are           */
 /* prefetched A_PRE bytes ahead.                                                */
 .macro COPY4x2
 	pld	[ AO1, #A_PRE  ]
 	fldmiad	AO1, { d0 - d3 }
 	add	r3, AO1, LDA
 	pld	[ r3, #A_PRE  ]
 	fldmiad	r3, { d4 - d7 }
 	fstmiad	BO1, { d0 - d7 }
 	add	AO1, AO1, #32
 	add	BO1, BO1, M4
 .endm
 /* COPY2x2: read two consecutive doubles from AO1 and from the line LDA bytes   */
 /* after it (d0-d1, d2-d3) and store the four values contiguously at BO2.  AO1  */
 /* then advances by 16 bytes and BO2 by 32 bytes.  r3 is scratch.               */
 .macro COPY2x2
 	fldmiad	AO1, { d0 - d1 }
 	add	r3, AO1, LDA
 	fldmiad	r3, { d2 - d3 }
 	fstmiad	BO2, { d0 - d3 }
 	add	AO1, AO1, #16
 	add	BO2, BO2, #32
 .endm
 /* COPY1x2: read one double from AO1 and one from the line LDA bytes after it   */
 /* and store both at BO3.  AO1 then advances by 8 bytes and BO3 by 16 bytes.    */
 /* r3 is scratch.                                                               */
 .macro COPY1x2
 	fldmiad	AO1, { d0 }
 	add	r3, AO1, LDA
 	fldmiad	r3, { d1 }
 	fstmiad	BO3, { d0 - d1 }
 	add	AO1, AO1, #8
 	add	BO3, BO3, #16
 .endm
 /*************************************************************************************************************************/
 /* COPY4x1: copy four consecutive doubles from AO1 to BO1, prefetching A_PRE    */
 /* bytes ahead.  AO1 then advances by 32 bytes and BO1 by M4.                   */
 .macro COPY4x1
 	pld	[ AO1, #A_PRE  ]
 	fldmiad	AO1, { d0 - d3 }
 	fstmiad	BO1, { d0 - d3 }
 	add	AO1, AO1, #32
 	add	BO1, BO1, M4
 .endm
 /* COPY2x1: copy two consecutive doubles from AO1 to BO2; both cursors then     */
 /* advance by 16 bytes.                                                         */
 .macro COPY2x1
 	fldmiad	AO1, { d0 - d1 }
 	fstmiad	BO2, { d0 - d1 }
 	add	AO1, AO1, #16
 	add	BO2, BO2, #16
 .endm
 /* COPY1x1: copy a single double from AO1 to BO3; both cursors then advance by  */
 /* 8 bytes.                                                                     */
 .macro COPY1x1
 	fldmiad	AO1, { d0 }
 	fstmiad	BO3, { d0 }
 	add	AO1, AO1, #8
 	add	BO3, BO3, #8
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * 4-wide tcopy: packs the source matrix at A into the buffer at B.
 	 *
 	 * The source is walked in groups of four lines that are LDA = lda * 8
 	 * bytes apart (M / 4 groups, then a pair of lines if M & 2, then a single
 	 * line if M & 1).  Within a group, N / 4 blocks of four doubles go to
 	 * BO1, which starts at the current B and steps by M4 = M * 32 bytes per
 	 * block; the N & 2 remainder goes to BO2 and the N & 1 remainder to BO3:
 	 *     BO2 = B + (N & -4) * M * 8
 	 *     BO3 = B + (N & -2) * M * 8
 	 * The A and B stack slots hold the running source / output positions
 	 * between groups.  d8-d15 are saved at [fp, #-128] and restored on exit
 	 * because COPY4x4 uses them.  Returns 0 in r0.
 	 *
 	 * Branch conditions: ASR and TST do not write the V flag, while BLE tests
 	 * Z || (N != V).  Signed counts are therefore checked with an explicit
 	 * CMP #0 before BLE, and bit-mask tests use BEQ, so that no branch
 	 * depends on a stale V flag (on entry V is whatever the caller left
 	 * behind).
 	 */
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	str	OLD_A, A					// store A
 	lsl	LDA, OLD_LDA, #3				// lda = lda * SIZE
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	lsl	r4 , M, #3					// M * SIZE
 	ldr	r3, B
 	and	BO2 , N , #-4
 	and	BO3 , N , #-2
 	mul	BO2, BO2, r4
 	mul	BO3, BO3, r4
 	add	BO2 , BO2, r3
 	add	BO3 , BO3, r3
 	lsl	M4, M, #5					// M4 = M * 4 * SIZE
 dgemm_tcopy_L4_BEGIN:
 	asr	J, M, #2					// J = M / 4
 	cmp	J, #0						// CMP defines V for the BLE below
 	ble	dgemm_tcopy_L2_BEGIN
 dgemm_tcopy_L4_M4_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	lsl	r3, LDA, #2					// r3 = 4 * LDA
 	add	r3, r3 , AO1					// A = A + 4 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #128					// B = B + 16 * SIZE
 	str	r3, B
 	asr	I, N, #2					// I = N / 4
 	cmp	I, #0
 	ble	dgemm_tcopy_L4_M4_40
 dgemm_tcopy_L4_M4_20:
 	COPY4x4
 	subs	I , I , #1
 	bne	dgemm_tcopy_L4_M4_20
 dgemm_tcopy_L4_M4_40:
 	tst	N , #2						// two-wide remainder?
 	beq	dgemm_tcopy_L4_M4_60
 	COPY2x4
 dgemm_tcopy_L4_M4_60:
 	tst	N, #1						// one-wide remainder?
 	beq	dgemm_tcopy_L4_M4_END
 	COPY1x4
 dgemm_tcopy_L4_M4_END:
 	subs	J , J, #1						// j--
 	bne	dgemm_tcopy_L4_M4_BEGIN
 /*********************************************************************************************/
 dgemm_tcopy_L2_BEGIN:
 	tst	M, #3						// any lines left after the groups of four?
 	beq	dgemm_tcopy_L999
 	tst	M, #2						// is there a pair of lines?
 	beq	dgemm_tcopy_L1_BEGIN
 dgemm_tcopy_L2_M4_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	lsl	r3, LDA, #1					// r3 = 2 * LDA
 	add	r3, r3 , AO1					// A = A + 2 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #64					// B = B + 8 * SIZE
 	str	r3, B
 	asr	I, N, #2					// I = N / 4
 	cmp	I, #0
 	ble	dgemm_tcopy_L2_M4_40
 dgemm_tcopy_L2_M4_20:
 	COPY4x2
 	subs	I , I , #1
 	bne	dgemm_tcopy_L2_M4_20
 dgemm_tcopy_L2_M4_40:
 	tst	N , #2						// two-wide remainder?
 	beq	dgemm_tcopy_L2_M4_60
 	COPY2x2
 dgemm_tcopy_L2_M4_60:
 	tst	N , #1						// one-wide remainder?
 	beq	dgemm_tcopy_L2_M4_END
 	COPY1x2
 dgemm_tcopy_L2_M4_END:
 /*********************************************************************************************/
 dgemm_tcopy_L1_BEGIN:
 	tst	M, #1						// is there a final single line?
 	beq	dgemm_tcopy_L999
 dgemm_tcopy_L1_M4_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	add	r3, LDA , AO1					// A = A + 1 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #32					// B = B + 4 * SIZE
 	str	r3, B
 	asr	I, N, #2					// I = N / 4
 	cmp	I, #0
 	ble	dgemm_tcopy_L1_M4_40
 dgemm_tcopy_L1_M4_20:
 	COPY4x1
 	subs	I , I , #1
 	bne	dgemm_tcopy_L1_M4_20
 dgemm_tcopy_L1_M4_40:
 	tst	N , #2						// two-wide remainder?
 	beq	dgemm_tcopy_L1_M4_60
 	COPY2x1
 dgemm_tcopy_L1_M4_60:
 	tst	N , #1						// one-wide remainder?
 	beq	dgemm_tcopy_L1_M4_END
 	COPY1x1
 dgemm_tcopy_L1_M4_END:
 dgemm_tcopy_L999:
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/dot.c
+++ b/kernel/arm/dot.c
@ -0,0 +1,64 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Generic dot product of two strided vectors.
  *
  * Returns the sum over i in [0, n) of x[i * inc_x] * y[i * inc_y].
  * The sum is accumulated in double; when DSDOT is defined the double is
  * returned as is, otherwise it is converted to FLOAT on return.
  *
  * n      number of elements; n <= 0 yields 0.0
  * x, y   input vectors (read only)
  * inc_x  element stride of x
  * inc_y  element stride of y
  */
 #if defined(DSDOT)
 double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 #else
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 #endif
 {
 	BLASLONG i;
 	BLASLONG ix = 0, iy = 0;
 	double dot = 0.0;
 	if ( n <= 0 ) return(dot);
 	for (i = 0; i < n; i++)
 	{
 		/* Widen BEFORE multiplying: with a single-precision FLOAT the
 		 * product would otherwise be rounded to float first, which
 		 * defeats the double-precision accumulation DSDOT promises. */
 		dot += (double)y[iy] * (double)x[ix];
 		ix += inc_x;
 		iy += inc_y;
 	}
 	return(dot);
 }
--- a/kernel/arm/dtrmm_kernel_4x2_vfp.S
+++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S
--- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
--- a/kernel/arm/gemv_n.c
+++ b/kernel/arm/gemv_n.c
@ -0,0 +1,67 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Generic (non-transposed) matrix-vector update: y += alpha * A * x.
  *
  * A is read as n columns of m elements each, consecutive columns being lda
  * elements apart.  For every column j the scalar alpha * x[j * inc_x] is
  * formed once and that multiple of the column is added to y.
  *
  * m, n    number of rows / columns of A
  * dummy1  unused
  * alpha   scalar multiplier
  * a, lda  matrix data and its column stride (in elements)
  * x       input vector, element stride inc_x
  * y       vector updated in place, element stride inc_y
  * buffer  unused
  *
  * Always returns 0.
  */
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
 	BLASLONG i;
 	BLASLONG ix,iy;
 	BLASLONG j;
 	FLOAT *a_ptr;
 	FLOAT temp;
 	ix = 0;
 	a_ptr = a;
 	for (j=0; j<n; j++)
 	{
 		temp = alpha * x[ix];
 		iy = 0;
 		for (i=0; i<m; i++)
 		{
 			y[iy] += temp * a_ptr[i];
 			iy += inc_y;
 		}
 		a_ptr += lda;
 		ix    += inc_x;
 	}
 	/* The function is declared int: return a defined value instead of
 	 * falling off the end, which is undefined if the caller reads it. */
 	return(0);
 }
--- a/kernel/arm/gemv_n_vfp.S
+++ b/kernel/arm/gemv_n_vfp.S
@ -0,0 +1,740 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/28 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of local frame reserved below the pushed core registers. */
 #define STACKSIZE 256
 /*
  * Operands read through fp.  The prologue sets fp to the value sp had on
  * entry (7 registers pushed = 28 bytes, then fp = sp + 28), so these are
  * presumably the stack-passed arguments lda, x, inc_x, y, inc_y -- confirm
  * against the C prototype of the kernel.
  */
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 /* Incoming values held in core registers; both are spilled to the A / M slots. */
 #define OLD_A		r3
 #define	OLD_M		r0
 /* Working registers.  AO1 shares r0 with OLD_M, which is why M is spilled. */
 #define AO1	r0
 #define N	r1
 #define J	r2
 #define AO2	r4
 #define XO	r5
 #define YO	r6
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 #define I	r12
 /* Spill slots inside the local frame (sp ends up at fp - 28 - STACKSIZE). */
 #define M	[fp, #-252 ]
 #define A	[fp, #-256 ]
 /* Byte offsets added to XO / YO / AO2 in the pld prefetch instructions. */
 #define X_PRE	64
 #define Y_PRE	0
 #define A_PRE	0
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	defined(DOUBLE)
 /*
  * INIT_F8 (double, unit stride): prefetch the y block at YO and clear the
  * eight accumulators d8-d15.
  * The zero is built from a core register instead of "vsub d8, d8, d8":
  * x - x is NaN whenever the stale register content is NaN or +/-Inf, and
  * that NaN would end up in y.  r3 is free here; every call site has already
  * stored it to the A slot.
  */
 .macro INIT_F8
 	pld     [ YO , #Y_PRE ]
 	pld     [ YO , #Y_PRE+32 ]
 	mov		r3  , #0
 	vmov		d8  , r3 , r3
 	vmov.f64	d9  , d8
 	vmov.f64	d10 , d8
 	vmov.f64	d11 , d8
 	vmov.f64	d12 , d8
 	vmov.f64	d13 , d8
 	vmov.f64	d14 , d8
 	vmov.f64	d15 , d8
 .endm
 /* KERNEL_F8X8: eight consecutive KERNEL_F8X1 steps with x prefetches. */
 .macro KERNEL_F8X8
 	pld     [ XO , #X_PRE ]
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	pld     [ XO , #X_PRE ]
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 .endm
 /*
  * KERNEL_F8X1: one column step for a block of eight rows.
  * Loads one x element into d2 (XO post-incremented), loads eight elements
  * starting at AO1 in two groups of four (r3 = AO1 + 4*SIZE is scratch for
  * the second group) and accumulates x * a into d8-d15.  AO1 and the
  * prefetch pointer AO2 then advance by LDA.
  */
 .macro KERNEL_F8X1
 	pld	[ AO2 , #A_PRE ]
 	fldmiad	XO! ,  { d2 }
 	fldmiad	AO1 ,  { d4 - d7 }
 	vmla.f64	d8  , d2 , d4
 	pld	[ AO2 , #4*SIZE ]
 	vmla.f64	d9  , d2 , d5
 	add	r3, AO1, #4*SIZE
 	vmla.f64	d10 , d2 , d6
 	vmla.f64	d11 , d2 , d7
 	fldmiad	r3 ,  { d4 - d7 }
 	vmla.f64	d12 , d2 , d4
 	vmla.f64	d13 , d2 , d5
 	add		AO1, AO1, LDA
 	vmla.f64	d14 , d2 , d6
 	add		AO2, AO2, LDA
 	vmla.f64	d15 , d2 , d7
 .endm
 /*
  * SAVE_F8: y[0..7] += d0 * (d8..d15), four elements at a time, advancing YO
  * past the eight elements.  d0 is never written in this file; it presumably
  * still holds alpha as passed by the caller -- confirm against the ABI in use.
  */
 .macro	SAVE_F8
 	fldmiad	YO,  { d4 - d7 }
 	vmla.f64	d4 , d0, d8
 	vmla.f64	d5 , d0, d9
 	vmla.f64	d6 , d0, d10
 	vmla.f64	d7 , d0, d11
 	fstmiad	YO!, { d4 - d7 }
 	fldmiad	YO,  { d4 - d7 }
 	vmla.f64	d4 , d0, d12
 	vmla.f64	d5 , d0, d13
 	vmla.f64	d6 , d0, d14
 	vmla.f64	d7 , d0, d15
 	fstmiad	YO!, { d4 - d7 }
 .endm
 /*
  * INIT_F1: clear the single accumulator d12.  Zero comes from r3 (free at
  * the call site) because "vsub d12, d12, d12" yields NaN for NaN/Inf input.
  */
 .macro INIT_F1
 	mov		r3  , #0
 	vmov		d12 , r3 , r3
 .endm
 /* KERNEL_F1X1: d12 += x[j] * a[row, j]; XO post-increments, AO1 += LDA. */
 .macro KERNEL_F1X1
 	fldmiad	XO! ,  { d2 }
 	fldmiad	AO1 ,  { d8 }
 	vmla.f64	d12 , d2 , d8
 	add		AO1, AO1, LDA
 .endm
 /* SAVE_F1: y[0] += d0 * d12, then YO advances by one element. */
 .macro	SAVE_F1
 	fldmiad	YO,  { d4 }
 	vmla.f64	d4, d0, d12
 	fstmiad	YO!, { d4 }
 .endm
 /*********************************************************************************************/
 /*
  * INIT_S4 (double, strided): clear the four accumulators d12-d15.
  * Zero is built from r3 (free at every call site, already stored to the A
  * slot) instead of "vsub d12, d12, d12", which yields NaN when the stale
  * register content is NaN or +/-Inf.
  */
 .macro INIT_S4
 	mov		r3  , #0
 	vmov		d12 , r3 , r3
 	vmov.f64	d13 , d12
 	vmov.f64	d14 , d12
 	vmov.f64	d15 , d12
 .endm
 /* KERNEL_S4X4: four consecutive KERNEL_S4X1 column steps. */
 .macro KERNEL_S4X4
 	KERNEL_S4X1
 	KERNEL_S4X1
 	KERNEL_S4X1
 	KERNEL_S4X1
 .endm
 /*
  * KERNEL_S4X1: one column step for a block of four rows.
  * Loads x at XO into d2 (no write-back), four elements at AO1 into d8-d11
  * and accumulates x * a into d12-d15.  AO1 and the prefetch pointer AO2
  * advance by LDA, XO by INC_X.
  */
 .macro KERNEL_S4X1
 	pld	[ AO2 , #A_PRE ]
 	fldmiad	XO  ,  { d2 }
 	fldmiad	AO1 ,  { d8 - d11 }
 	vmla.f64	d12 , d2 , d8
 	add		AO1, AO1, LDA
 	vmla.f64	d13 , d2 , d9
 	add		AO2, AO2, LDA
 	vmla.f64	d14 , d2 , d10
 	vmla.f64	d15 , d2 , d11
 	add		XO, XO , INC_X
 .endm
 /*
  * SAVE_S4: four read-modify-write updates y += d0 * acc, stepping YO by
  * INC_X-independent stride INC_Y after each element.  d0 is never written in
  * this file; presumably alpha as passed by the caller -- confirm.
  */
 .macro	SAVE_S4
 	fldmiad	YO,  { d4 }
 	vmla.f64	d4 , d0, d12
 	fstmiad	YO,  { d4 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d5 }
 	vmla.f64	d5 , d0, d13
 	fstmiad	YO,  { d5 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d4 }
 	vmla.f64	d4 , d0, d14
 	fstmiad	YO,  { d4 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d5 }
 	vmla.f64	d5 , d0, d15
 	fstmiad	YO,  { d5 }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_S1: clear the single accumulator d12 from r3 (free at the call site);
  * "vsub d12, d12, d12" would give NaN for NaN/Inf register content.
  */
 .macro INIT_S1
 	mov		r3  , #0
 	vmov		d12 , r3 , r3
 .endm
 /* KERNEL_S1X1: d12 += x * a for one row; AO1 += LDA, XO += INC_X. */
 .macro KERNEL_S1X1
 	fldmiad	XO  ,  { d2 }
 	fldmiad	AO1 ,  { d8 }
 	vmla.f64	d12 , d2 , d8
 	add		AO1, AO1, LDA
 	add		XO, XO , INC_X
 .endm
 /* SAVE_S1: y += d0 * d12 for one element, then YO += INC_Y. */
 .macro	SAVE_S1
 	fldmiad	YO,  { d4 }
 	vmla.f64	d4, d0, d12
 	fstmiad	YO , { d4 }
 	add	YO, YO, INC_Y
 .endm
 #else	/************************* SINGLE PRECISION *****************************************/
 /*
  * INIT_F8 (single, unit stride): prefetch the y block at YO and clear the
  * eight accumulators s8-s15.
  * The zero is built from a core register instead of "vsub s8, s8, s8":
  * x - x is NaN whenever the stale register content is NaN or +/-Inf, and
  * that NaN would end up in y.  r3 is free here; every call site has already
  * stored it to the A slot.
  */
 .macro INIT_F8
 	pld     [ YO , #Y_PRE ]
 	mov		r3  , #0
 	vmov		s8  , r3
 	vmov.f32	s9  , s8
 	vmov.f32	s10 , s8
 	vmov.f32	s11 , s8
 	vmov.f32	s12 , s8
 	vmov.f32	s13 , s8
 	vmov.f32	s14 , s8
 	vmov.f32	s15 , s8
 .endm
 /* KERNEL_F8X8: one x prefetch followed by eight KERNEL_F8X1 column steps. */
 .macro KERNEL_F8X8
 	pld     [ XO , #X_PRE ]
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 .endm
 /*
  * KERNEL_F8X1: one column step for a block of eight rows.
  * Loads one x element into s2 (XO post-incremented), loads eight elements
  * starting at AO1 in two groups of four (r3 = AO1 + 4*SIZE is scratch for
  * the second group) and accumulates x * a into s8-s15.  AO1 and the
  * prefetch pointer AO2 then advance by LDA.
  */
 .macro KERNEL_F8X1
 	pld	[ AO2, #A_PRE ]
 	fldmias	XO! ,  { s2 }
 	fldmias	AO1 ,  { s4 - s7 }
 	vmla.f32	s8  , s2 , s4
 	vmla.f32	s9  , s2 , s5
 	vmla.f32	s10 , s2 , s6
 	vmla.f32	s11 , s2 , s7
 	add	r3, AO1, #4*SIZE
 	fldmias	r3 ,  { s4 - s7 }
 	vmla.f32	s12 , s2 , s4
 	vmla.f32	s13 , s2 , s5
 	vmla.f32	s14 , s2 , s6
 	vmla.f32	s15 , s2 , s7
 	add		AO1, AO1, LDA
 	add		AO2, AO2, LDA
 .endm
 /*
  * SAVE_F8: y[0..7] += s0 * (s8..s15), four elements at a time, advancing YO
  * past the eight elements.  s0 is never written in this file; it presumably
  * still holds alpha as passed by the caller -- confirm against the ABI in use.
  */
 .macro	SAVE_F8
 	fldmias	YO,  { s4 - s7 }
 	vmla.f32	s4 , s0, s8
 	vmla.f32	s5 , s0, s9
 	vmla.f32	s6 , s0, s10
 	vmla.f32	s7 , s0, s11
 	fstmias	YO!, { s4 - s7 }
 	fldmias	YO,  { s4 - s7 }
 	vmla.f32	s4 , s0, s12
 	vmla.f32	s5 , s0, s13
 	vmla.f32	s6 , s0, s14
 	vmla.f32	s7 , s0, s15
 	fstmias	YO!, { s4 - s7 }
 .endm
 /*
  * INIT_F1: clear the single accumulator s12.  Zero comes from r3 (free at
  * the call site) because "vsub s12, s12, s12" yields NaN for NaN/Inf input.
  */
 .macro INIT_F1
 	mov		r3  , #0
 	vmov		s12 , r3
 .endm
 /* KERNEL_F1X1: s12 += x[j] * a[row, j]; XO post-increments, AO1 += LDA. */
 .macro KERNEL_F1X1
 	fldmias	XO! ,  { s2 }
 	fldmias	AO1 ,  { s8 }
 	vmla.f32	s12 , s2 , s8
 	add		AO1, AO1, LDA
 .endm
 /* SAVE_F1: y[0] += s0 * s12, then YO advances by one element. */
 .macro	SAVE_F1
 	fldmias	YO,  { s4 }
 	vmla.f32	s4, s0, s12
 	fstmias	YO!, { s4 }
 .endm
 /*********************************************************************************************/
 /*
  * INIT_S4 (single, strided): clear the four accumulators s12-s15.
  * Zero is built from r3 (free at every call site, already stored to the A
  * slot) instead of "vsub s12, s12, s12", which yields NaN when the stale
  * register content is NaN or +/-Inf.
  */
 .macro INIT_S4
 	mov		r3  , #0
 	vmov		s12 , r3
 	vmov.f32	s13 , s12
 	vmov.f32	s14 , s12
 	vmov.f32	s15 , s12
 .endm
 /* KERNEL_S4X4: four KERNEL_S4X1 column steps with two prefetches of A. */
 .macro KERNEL_S4X4
 	pld	[ AO2 , #A_PRE ]
 	KERNEL_S4X1
 	KERNEL_S4X1
 	pld	[ AO2 , #A_PRE ]
 	KERNEL_S4X1
 	KERNEL_S4X1
 .endm
 /*
  * KERNEL_S4X1: one column step for a block of four rows.
  * Loads x at XO into s2 (no write-back), four elements at AO1 into s8-s11
  * and accumulates x * a into s12-s15.  AO1 and the prefetch pointer AO2
  * advance by LDA, XO by INC_X.
  */
 .macro KERNEL_S4X1
 	fldmias	XO  ,  { s2 }
 	fldmias	AO1 ,  { s8 - s11 }
 	vmla.f32	s12 , s2 , s8
 	vmla.f32	s13 , s2 , s9
 	vmla.f32	s14 , s2 , s10
 	vmla.f32	s15 , s2 , s11
 	add		AO1, AO1, LDA
 	add		AO2, AO2, LDA
 	add		XO, XO , INC_X
 .endm
 /*
  * SAVE_S4: four read-modify-write updates y += s0 * acc, stepping YO by
  * INC_Y after each element.  s0 is never written in this file; presumably
  * alpha as passed by the caller -- confirm.
  */
 .macro	SAVE_S4
 	fldmias	YO,  { s4 }
 	vmla.f32	s4 , s0, s12
 	fstmias	YO,  { s4 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s5 }
 	vmla.f32	s5 , s0, s13
 	fstmias	YO,  { s5 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s4 }
 	vmla.f32	s4 , s0, s14
 	fstmias	YO,  { s4 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s5 }
 	vmla.f32	s5 , s0, s15
 	fstmias	YO,  { s5 }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_S1: clear the single accumulator s12 from r3 (free at the call site);
  * "vsub s12, s12, s12" would give NaN for NaN/Inf register content.
  */
 .macro INIT_S1
 	mov		r3  , #0
 	vmov		s12 , r3
 .endm
 /* KERNEL_S1X1: s12 += x * a for one row; AO1 += LDA, XO += INC_X. */
 .macro KERNEL_S1X1
 	fldmias	XO  ,  { s2 }
 	fldmias	AO1 ,  { s8 }
 	vmla.f32	s12 , s2 , s8
 	add		AO1, AO1, LDA
 	add		XO, XO , INC_X
 .endm
 /* SAVE_S1: y += s0 * s12 for one element, then YO += INC_Y. */
 .macro	SAVE_S1
 	fldmias	YO,  { s4 }
 	vmla.f32	s4, s0, s12
 	fstmias	YO , { s4 }
 	add	YO, YO, INC_Y
 .endm
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * Kernel body.  For each block of rows it walks all N columns,
 	 * accumulating x[j] * A[rows, j], then adds the scaled result to y
 	 * (see the SAVE_* macros).  Two code paths exist: a unit-stride path
 	 * (INC_X == 1 and INC_Y == 1) working on blocks of 8 rows, and a
 	 * strided path working on blocks of 4 rows; both finish the left-over
 	 * rows one at a time.  r0 is set to 0 on every exit.
 	 */
 	PROLOGUE
 	.align 5
 	push    {r4 - r9 , fp}
 	/* 7 registers = 28 bytes were pushed, so fp is sp as it was on entry */
        add     fp, sp, #28
 	sub     sp, sp, #STACKSIZE                              // reserve stack
 	/* floating point save area starts at fp - 192, inside the local frame */
        sub     r12, fp, #192
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
        vstm    r12, { s8 - s15 }                                 // store floating point registers
 #endif
 	/* nothing to do for M <= 0 or N <= 0 */
 	cmp	OLD_M, #0
 	ble	gemvn_kernel_L999
 	cmp	N, #0
 	ble	gemvn_kernel_L999
 	/* spill A and M: r3 becomes scratch and r0 is reused as AO1 */
 	str	OLD_A, A
 	str	OLD_M, M
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	/* a zero increment for x or y exits without touching y */
 	cmp	INC_X, #0
 	beq	gemvn_kernel_L999
 	cmp	INC_Y, #0
 	beq	gemvn_kernel_L999
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #3				// LDA * SIZE
 #else
 	lsl	LDA, LDA, #2				// LDA * SIZE
 #endif
 	/* anything other than INC_X == 1 and INC_Y == 1 takes the strided path */
 	cmp	INC_X, #1
 	bne	gemvn_kernel_S4_BEGIN
 	cmp	INC_Y, #1
 	bne	gemvn_kernel_S4_BEGIN
 /* Unit-stride path.  The labels say F4, but each pass handles 8 rows. */
 gemvn_kernel_F4_BEGIN:
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #3					// I = M / 8
 	ble	gemvn_kernel_F1_BEGIN
 /* one block of 8 rows: reload the row pointer, store the next block start */
 gemvn_kernel_F4X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #8*SIZE
 	str	r3 , A
 	/* AO2 = AO1 + 3 * LDA; it is only used as a prefetch address */
 	add	AO2, AO2, LDA
 	add	AO2, AO2, LDA
 	ldr	XO , X
 	INIT_F8
 	asrs	J, N, #3					// J = N / 8
 	ble	gemvn_kernel_F4X1
 gemvn_kernel_F4X4_10:
 	KERNEL_F8X8
 	subs	J, J, #1
 	bne	gemvn_kernel_F4X4_10
 /* remaining N % 8 columns, one at a time; skipped when the remainder is 0 */
 gemvn_kernel_F4X1:
 	ands	J, N , #7
 	ble	gemvn_kernel_F4_END
 gemvn_kernel_F4X1_10:
 	KERNEL_F8X1
 	subs	J, J, #1
 	bne	gemvn_kernel_F4X1_10
 gemvn_kernel_F4_END:
 	SAVE_F8
 	subs	I , I , #1
 	bne	gemvn_kernel_F4X4
 /* remaining M % 8 rows of the unit-stride path, one row per pass */
 gemvn_kernel_F1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #7
 	ble	gemvn_kernel_L999
 gemvn_kernel_F1X1:
 	ldr	AO1, A
 	add	r3, AO1, #SIZE
 	str	r3, A
 	ldr	XO , X
 	INIT_F1
 	mov	J, N
 gemvn_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	J, J, #1
 	bne	gemvn_kernel_F1X1_10
 gemvn_kernel_F1_END:
 	SAVE_F1
 	subs	I , I , #1
 	bne	gemvn_kernel_F1X1
 	b	gemvn_kernel_L999
 /*************************************************************************************************************/
 /* Strided path: increments are first converted from elements to bytes. */
 gemvn_kernel_S4_BEGIN:
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 #endif
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #2					// I = M / 4
 	ble	gemvn_kernel_S1_BEGIN
 /* one block of 4 rows: reload the row pointer, store the next block start */
 gemvn_kernel_S4X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #4*SIZE
 	str	r3 , A
 	ldr	XO , X
 	INIT_S4
 	asrs	J, N, #2					// J = N / 4
 	ble	gemvn_kernel_S4X1
 gemvn_kernel_S4X4_10:
 	KERNEL_S4X4
 	subs	J, J, #1
 	bne	gemvn_kernel_S4X4_10
 /* remaining N % 4 columns, one at a time */
 gemvn_kernel_S4X1:
 	ands	J, N , #3
 	ble	gemvn_kernel_S4_END
 gemvn_kernel_S4X1_10:
 	KERNEL_S4X1
 	subs	J, J, #1
 	bne	gemvn_kernel_S4X1_10
 gemvn_kernel_S4_END:
 	SAVE_S4
 	subs	I , I , #1
 	bne	gemvn_kernel_S4X4
 /* remaining M % 4 rows of the strided path, one row per pass */
 gemvn_kernel_S1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #3
 	ble	gemvn_kernel_L999
 gemvn_kernel_S1X1:
 	ldr	AO1, A
 	add	r3, AO1, #SIZE
 	str	r3, A
 	ldr	XO , X
 	INIT_S1
 	mov	J, N
 gemvn_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	J, J, #1
 	bne	gemvn_kernel_S1X1_10
 gemvn_kernel_S1_END:
 	SAVE_S1
 	subs	I , I , #1
 	bne	gemvn_kernel_S1X1
 /*************************************************************************************************************/
 /* Common exit: restore the saved floating point and core registers. */
 gemvn_kernel_L999:
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s15 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	/* sp back to just below the 7 pushed registers */
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/gemv_n_vfpv3.S
+++ b/kernel/arm/gemv_n_vfpv3.S
@ -0,0 +1,781 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/19 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of local frame reserved below the pushed core registers. */
 #define STACKSIZE 256
 /*
  * Operands read through fp.  The prologue sets fp to the value sp had on
  * entry (7 registers pushed = 28 bytes, then fp = sp + 28), so these are
  * presumably the stack-passed arguments lda, x, inc_x, y, inc_y -- confirm
  * against the C prototype of the kernel.
  */
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 /* Incoming values held in core registers; both are spilled to the A / M slots. */
 #define OLD_A		r3
 #define	OLD_M		r0
 /* Working registers.  AO1 shares r0 with OLD_M, which is why M is spilled. */
 #define AO1	r0
 #define N	r1
 #define J	r2
 #define AO2	r4
 #define XO	r5
 #define YO	r6
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 #define I	r12
 /* Spill slots inside the local frame (sp ends up at fp - 28 - STACKSIZE). */
 #define M	[fp, #-252 ]
 #define A	[fp, #-256 ]
 /* Byte offsets added to XO / YO / AO2 in the pld prefetch instructions. */
 #define X_PRE	64
 #define Y_PRE	0
 #define A_PRE	0
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	defined(DOUBLE)
 /*
  * INIT_F8 (double, unit stride): prefetch the y block at YO and clear the
  * eight accumulators d24-d31.
  * The zero is built from a core register instead of "vsub d24, d24, d24":
  * x - x is NaN whenever the stale register content is NaN or +/-Inf, and
  * that NaN would end up in y.  r3 is free here; every call site has already
  * stored it to the A slot.
  */
 .macro INIT_F8
 	pld     [ YO , #Y_PRE ]
 	pld     [ YO , #Y_PRE+32 ]
 	mov		r3  , #0
 	vmov		d24 , r3 , r3
 	vmov.f64	d25 , d24
 	vmov.f64	d26 , d24
 	vmov.f64	d27 , d24
 	vmov.f64	d28 , d24
 	vmov.f64	d29 , d24
 	vmov.f64	d30 , d24
 	vmov.f64	d31 , d24
 .endm
 /* KERNEL_F8X8: eight consecutive KERNEL_F8X1 steps with x prefetches. */
 .macro KERNEL_F8X8
 	pld     [ XO , #X_PRE ]
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	pld     [ XO , #X_PRE ]
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 .endm
 /*
  * KERNEL_F8X1: one column step for a block of eight rows.
  * Loads one x element into d4 (XO post-incremented), eight elements at AO1
  * into d8-d15 and accumulates x * a into d24-d31.  AO1 and the prefetch
  * pointer AO2 then advance by LDA.
  */
 .macro KERNEL_F8X1
 	fldmiad	XO! ,  { d4 }
 	fldmiad	AO1 ,  { d8 - d15 }
 	vmla.f64	d24 , d4 , d8
 	pld	[ AO2 , #A_PRE ]
 	vmla.f64	d25 , d4 , d9
 	pld	[ AO2 , #A_PRE+32 ]
 	vmla.f64	d26 , d4 , d10
 	vmla.f64	d27 , d4 , d11
 	vmla.f64	d28 , d4 , d12
 	vmla.f64	d29 , d4 , d13
 	add		AO1, AO1, LDA
 	vmla.f64	d30 , d4 , d14
 	add		AO2, AO2, LDA
 	vmla.f64	d31 , d4 , d15
 .endm
 /*
  * SAVE_F8: y[0..7] += d0 * (d24..d31), advancing YO past the eight
  * elements.  d0 is never written in this file; it presumably still holds
  * alpha as passed by the caller -- confirm against the ABI in use.
  */
 .macro	SAVE_F8
 	fldmiad	YO,  { d16 - d23 }
 	vmla.f64	d16, d0, d24
 	vmla.f64	d17, d0, d25
 	vmla.f64	d18, d0, d26
 	vmla.f64	d19, d0, d27
 	vmla.f64	d20, d0, d28
 	vmla.f64	d21, d0, d29
 	vmla.f64	d22, d0, d30
 	vmla.f64	d23, d0, d31
 	fstmiad	YO!, { d16 - d23 }
 .endm
 /*
  * INIT_F1: clear the single accumulator d24.  Zero comes from r3 (free at
  * the call site) because "vsub d24, d24, d24" yields NaN for NaN/Inf input.
  */
 .macro INIT_F1
 	mov		r3  , #0
 	vmov		d24 , r3 , r3
 .endm
 /* KERNEL_F1X1: d24 += x[j] * a[row, j]; XO post-increments, AO1 += LDA. */
 .macro KERNEL_F1X1
 	fldmiad	XO! ,  { d4 }
 	fldmiad	AO1 ,  { d8 }
 	vmla.f64	d24 , d4 , d8
 	add		AO1, AO1, LDA
 .endm
 /* SAVE_F1: y[0] += d0 * d24, then YO advances by one element. */
 .macro	SAVE_F1
 	fldmiad	YO,  { d16 }
 	vmla.f64	d16, d0, d24
 	fstmiad	YO!, { d16 }
 .endm
 /*********************************************************************************************/
 /*
  * INIT_S8 (double, strided): clear the eight accumulators d24-d31.
  * Zero is built from r3 (free at every call site, already stored to the A
  * slot) instead of "vsub d24, d24, d24", which yields NaN when the stale
  * register content is NaN or +/-Inf.
  */
 .macro INIT_S8
 	mov		r3  , #0
 	vmov		d24 , r3 , r3
 	vmov.f64	d25 , d24
 	vmov.f64	d26 , d24
 	vmov.f64	d27 , d24
 	vmov.f64	d28 , d24
 	vmov.f64	d29 , d24
 	vmov.f64	d30 , d24
 	vmov.f64	d31 , d24
 .endm
 /* KERNEL_S8X8: eight consecutive KERNEL_S8X1 column steps. */
 .macro KERNEL_S8X8
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 .endm
 /*
  * KERNEL_S8X1: one column step for a block of eight rows.
  * Loads x at XO into d4 (no write-back), eight elements at AO1 into d8-d15
  * and accumulates x * a into d24-d31.  AO1 and the prefetch pointer AO2
  * advance by LDA, XO by INC_X.
  */
 .macro KERNEL_S8X1
 	pld	[ AO2 , #A_PRE ]
 	pld	[ AO2 , #A_PRE+32 ]
 	fldmiad	XO ,  { d4 }
 	fldmiad	AO1 ,  { d8 - d15 }
 	vmla.f64	d24 , d4 , d8
 	vmla.f64	d25 , d4 , d9
 	vmla.f64	d26 , d4 , d10
 	vmla.f64	d27 , d4 , d11
 	vmla.f64	d28 , d4 , d12
 	vmla.f64	d29 , d4 , d13
 	vmla.f64	d30 , d4 , d14
 	vmla.f64	d31 , d4 , d15
 	add		AO1, AO1, LDA
 	add		AO2, AO2, LDA
 	add		XO, XO, INC_X
 .endm
 /*
  * SAVE_S8: eight read-modify-write updates y += d0 * acc, stepping YO by
  * INC_Y after each element.  d0 is never written in this file; presumably
  * alpha as passed by the caller -- confirm.
  */
 .macro	SAVE_S8
 	fldmiad	YO,  { d16 }
 	vmla.f64	d16, d0, d24
 	fstmiad	YO,  { d16 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d17 }
 	vmla.f64	d17, d0, d25
 	fstmiad	YO,  { d17 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d18 }
 	vmla.f64	d18, d0, d26
 	fstmiad	YO,  { d18 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d19 }
 	vmla.f64	d19, d0, d27
 	fstmiad	YO,  { d19 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d20 }
 	vmla.f64	d20, d0, d28
 	fstmiad	YO,  { d20 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d21 }
 	vmla.f64	d21, d0, d29
 	fstmiad	YO,  { d21 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d22 }
 	vmla.f64	d22, d0, d30
 	fstmiad	YO,  { d22 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d23 }
 	vmla.f64	d23, d0, d31
 	fstmiad	YO,  { d23 }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_S1: clear the single accumulator d24 from r3 (free at the call site);
  * "vsub d24, d24, d24" would give NaN for NaN/Inf register content.
  */
 .macro INIT_S1
 	mov		r3  , #0
 	vmov		d24 , r3 , r3
 .endm
 /* KERNEL_S1X1: d24 += x * a for one row; AO1 += LDA, XO += INC_X. */
 .macro KERNEL_S1X1
 	fldmiad	XO  ,  { d4 }
 	fldmiad	AO1 ,  { d8 }
 	vmla.f64	d24 , d4 , d8
 	add		AO1, AO1, LDA
 	add	XO, XO, INC_X
 .endm
 /* SAVE_S1: y += d0 * d24 for one element, then YO += INC_Y. */
 .macro	SAVE_S1
 	fldmiad	YO,  { d16 }
 	vmla.f64	d16, d0, d24
 	fstmiad	YO,  { d16 }
 	add	YO, YO, INC_Y
 .endm
 #else	/************************* SINGLE PRECISION *****************************************/
 /*
  * INIT_F8 (single, unit stride): prefetch the y block at YO and clear the
  * eight accumulators s24-s31.
  * The zero is built from a core register instead of "vsub s24, s24, s24":
  * x - x is NaN whenever the stale register content is NaN or +/-Inf, and
  * that NaN would end up in y.  r3 is free here; every call site has already
  * stored it to the A slot.
  */
 .macro INIT_F8
 	pld     [ YO , #Y_PRE ]
 	mov		r3  , #0
 	vmov		s24 , r3
 	vmov.f32	s25 , s24
 	vmov.f32	s26 , s24
 	vmov.f32	s27 , s24
 	vmov.f32	s28 , s24
 	vmov.f32	s29 , s24
 	vmov.f32	s30 , s24
 	vmov.f32	s31 , s24
 .endm
 /* KERNEL_F8X8: one x prefetch followed by eight KERNEL_F8X1 column steps. */
 .macro KERNEL_F8X8
 	pld     [ XO , #X_PRE ]
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 	KERNEL_F8X1
 .endm
 /*
  * KERNEL_F8X1: one column step for a block of eight rows.
  * Loads one x element into s4 (XO post-incremented), eight elements at AO1
  * into s8-s15 and accumulates x * a into s24-s31.  AO1 and the prefetch
  * pointer AO2 then advance by LDA.
  */
 .macro KERNEL_F8X1
 	pld	[ AO2 , #A_PRE ]
 	fldmias	XO! ,  { s4 }
 	fldmias	AO1 ,  { s8 - s15 }
 	vmla.f32	s24 , s4 , s8
 	vmla.f32	s25 , s4 , s9
 	vmla.f32	s26 , s4 , s10
 	vmla.f32	s27 , s4 , s11
 	vmla.f32	s28 , s4 , s12
 	vmla.f32	s29 , s4 , s13
 	vmla.f32	s30 , s4 , s14
 	vmla.f32	s31 , s4 , s15
 	add		AO1, AO1, LDA
 	add		AO2, AO2, LDA
 .endm
 /*
  * SAVE_F8: y[0..7] += s0 * (s24..s31), advancing YO past the eight
  * elements.  s0 is never written in this file; it presumably still holds
  * alpha as passed by the caller -- confirm against the ABI in use.
  */
 .macro	SAVE_F8
 	fldmias	YO,  { s16 - s23 }
 	vmla.f32	s16, s0, s24
 	vmla.f32	s17, s0, s25
 	vmla.f32	s18, s0, s26
 	vmla.f32	s19, s0, s27
 	vmla.f32	s20, s0, s28
 	vmla.f32	s21, s0, s29
 	vmla.f32	s22, s0, s30
 	vmla.f32	s23, s0, s31
 	fstmias	YO!, { s16 - s23 }
 .endm
 /*
  * INIT_F1: clear the single accumulator s24.  Zero comes from r3 (free at
  * the call site) because "vsub s24, s24, s24" yields NaN for NaN/Inf input.
  */
 .macro INIT_F1
 	mov		r3  , #0
 	vmov		s24 , r3
 .endm
 /* KERNEL_F1X1: s24 += x[j] * a[row, j]; XO post-increments, AO1 += LDA. */
 .macro KERNEL_F1X1
 	fldmias	XO! ,  { s4 }
 	fldmias	AO1 ,  { s8 }
 	vmla.f32	s24 , s4 , s8
 	add		AO1, AO1, LDA
 .endm
 /* SAVE_F1: y[0] += s0 * s24, then YO advances by one element. */
 .macro	SAVE_F1
 	fldmias	YO,  { s16 }
 	vmla.f32	s16, s0, s24
 	fstmias	YO!, { s16 }
 .endm
 /*********************************************************************************************/
 /*
  * INIT_S8 (single, strided): clear the eight accumulators s24-s31.
  * Zero is built from r3 (free at every call site, already stored to the A
  * slot) instead of "vsub s24, s24, s24", which yields NaN when the stale
  * register content is NaN or +/-Inf.
  */
 .macro INIT_S8
 	mov		r3  , #0
 	vmov		s24 , r3
 	vmov.f32	s25 , s24
 	vmov.f32	s26 , s24
 	vmov.f32	s27 , s24
 	vmov.f32	s28 , s24
 	vmov.f32	s29 , s24
 	vmov.f32	s30 , s24
 	vmov.f32	s31 , s24
 .endm
 /* KERNEL_S8X8: eight consecutive KERNEL_S8X1 column steps. */
 .macro KERNEL_S8X8
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 	KERNEL_S8X1
 .endm
 /*
  * KERNEL_S8X1: one column step for a block of eight rows.
  * Loads x at XO into s4 (no write-back), eight elements at AO1 into s8-s15
  * and accumulates x * a into s24-s31.  AO1 and the prefetch pointer AO2
  * advance by LDA, XO by INC_X.
  */
 .macro KERNEL_S8X1
 	pld	[ AO2 , #A_PRE ]
 	fldmias	XO ,  { s4 }
 	fldmias	AO1 ,  { s8 - s15 }
 	vmla.f32	s24 , s4 , s8
 	vmla.f32	s25 , s4 , s9
 	vmla.f32	s26 , s4 , s10
 	vmla.f32	s27 , s4 , s11
 	vmla.f32	s28 , s4 , s12
 	vmla.f32	s29 , s4 , s13
 	vmla.f32	s30 , s4 , s14
 	vmla.f32	s31 , s4 , s15
 	add		AO1, AO1, LDA
 	add		AO2, AO2, LDA
 	add		XO, XO, INC_X
 .endm
 /*
  * SAVE_S8: eight read-modify-write updates y += s0 * acc, stepping YO by
  * INC_Y after each element.  s0 is never written in this file; presumably
  * alpha as passed by the caller -- confirm.
  */
 .macro	SAVE_S8
 	fldmias	YO,  { s16 }
 	vmla.f32	s16, s0, s24
 	fstmias	YO,  { s16 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s17 }
 	vmla.f32	s17, s0, s25
 	fstmias	YO,  { s17 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s18 }
 	vmla.f32	s18, s0, s26
 	fstmias	YO,  { s18 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s19 }
 	vmla.f32	s19, s0, s27
 	fstmias	YO,  { s19 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s20 }
 	vmla.f32	s20, s0, s28
 	fstmias	YO,  { s20 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s21 }
 	vmla.f32	s21, s0, s29
 	fstmias	YO,  { s21 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s22 }
 	vmla.f32	s22, s0, s30
 	fstmias	YO,  { s22 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s23 }
 	vmla.f32	s23, s0, s31
 	fstmias	YO,  { s23 }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_S1: clear the single accumulator s24 from r3 (free at the call site);
  * "vsub s24, s24, s24" would give NaN for NaN/Inf register content.
  */
 .macro INIT_S1
 	mov		r3  , #0
 	vmov		s24 , r3
 .endm
 /* KERNEL_S1X1: s24 += x * a for one row; AO1 += LDA, XO += INC_X. */
 .macro KERNEL_S1X1
 	fldmias	XO  ,  { s4 }
 	fldmias	AO1 ,  { s8 }
 	vmla.f32	s24 , s4 , s8
 	add		AO1, AO1, LDA
 	add	XO, XO, INC_X
 .endm
 /* SAVE_S1: y += s0 * s24 for one element, then YO += INC_Y. */
 .macro	SAVE_S1
 	fldmias	YO,  { s16 }
 	vmla.f32	s16, s0, s24
 	fstmias	YO,  { s16 }
 	add	YO, YO, INC_Y
 .endm
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * Kernel body.  For each block of rows it walks all N columns,
 	 * accumulating x[j] * A[rows, j], then adds the scaled result to y
 	 * (see the SAVE_* macros).  Two code paths exist: a unit-stride path
 	 * (INC_X == 1 and INC_Y == 1) and a strided path; both work on blocks
 	 * of 8 rows and finish the left-over rows one at a time.  r0 is set
 	 * to 0 on every exit.
 	 */
 	PROLOGUE
 	.align 5
 	push    {r4 - r9 , fp}
 	/* 7 registers = 28 bytes were pushed, so fp is sp as it was on entry */
        add     fp, sp, #28
 	sub     sp, sp, #STACKSIZE                              // reserve stack
 	/* floating point save area starts at fp - 192, inside the local frame */
        sub     r12, fp, #192
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
        vstm    r12, { s8 - s31 }                                 // store floating point registers
 #endif
 	/* nothing to do for M <= 0 or N <= 0 */
 	cmp	OLD_M, #0
 	ble	gemvn_kernel_L999
 	cmp	N, #0
 	ble	gemvn_kernel_L999
 	/* spill A and M: r3 becomes scratch and r0 is reused as AO1 */
 	str	OLD_A, A
 	str	OLD_M, M
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	/* a zero increment for x or y exits without touching y */
 	cmp	INC_X, #0
 	beq	gemvn_kernel_L999
 	cmp	INC_Y, #0
 	beq	gemvn_kernel_L999
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #3				// LDA * SIZE
 #else
 	lsl	LDA, LDA, #2				// LDA * SIZE
 #endif
 	/* anything other than INC_X == 1 and INC_Y == 1 takes the strided path */
 	cmp	INC_X, #1
 	bne	gemvn_kernel_S8_BEGIN
 	cmp	INC_Y, #1
 	bne	gemvn_kernel_S8_BEGIN
 /* Unit-stride path, blocks of 8 rows. */
 gemvn_kernel_F8_BEGIN:
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #3					// I = M / 8
 	ble	gemvn_kernel_F1_BEGIN
 /* one block of 8 rows: reload the row pointer, store the next block start */
 gemvn_kernel_F8X8:
 	ldr	AO1, A
 	/* AO2 = AO1 + LDA; it is only used as a prefetch address */
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #8*SIZE
 	str	r3 , A
 	ldr	XO , X
 	INIT_F8
 	asrs	J, N, #3					// J = N / 8
 	ble	gemvn_kernel_F8X1
 gemvn_kernel_F8X8_10:
 	KERNEL_F8X8
 	subs	J, J, #1
 	bne	gemvn_kernel_F8X8_10
 /* remaining N % 8 columns, one at a time; skipped when the remainder is 0 */
 gemvn_kernel_F8X1:
 	ands	J, N , #7
 	ble	gemvn_kernel_F8_END
 gemvn_kernel_F8X1_10:
 	KERNEL_F8X1
 	subs	J, J, #1
 	bne	gemvn_kernel_F8X1_10
 gemvn_kernel_F8_END:
 	SAVE_F8
 	subs	I , I , #1
 	bne	gemvn_kernel_F8X8
 /* remaining M % 8 rows of the unit-stride path, one row per pass */
 gemvn_kernel_F1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #7
 	ble	gemvn_kernel_L999
 gemvn_kernel_F1X1:
 	ldr	AO1, A
 	add	r3, AO1, #SIZE
 	str	r3, A
 	ldr	XO , X
 	INIT_F1
 	mov	J, N
 gemvn_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	J, J, #1
 	bne	gemvn_kernel_F1X1_10
 gemvn_kernel_F1_END:
 	SAVE_F1
 	subs	I , I , #1
 	bne	gemvn_kernel_F1X1
 	b	gemvn_kernel_L999
 /*************************************************************************************************************/
 /* Strided path: increments are first converted from elements to bytes. */
 gemvn_kernel_S8_BEGIN:
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 #endif
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #3					// I = M / 8
 	ble	gemvn_kernel_S1_BEGIN
 /* one block of 8 rows: reload the row pointer, store the next block start */
 gemvn_kernel_S8X8:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #8*SIZE
 	str	r3 , A
 	ldr	XO , X
 	INIT_S8
 	asrs	J, N, #3					// J = N / 8
 	ble	gemvn_kernel_S8X1
 gemvn_kernel_S8X8_10:
 	KERNEL_S8X8
 	subs	J, J, #1
 	bne	gemvn_kernel_S8X8_10
 /* remaining N % 8 columns, one at a time */
 gemvn_kernel_S8X1:
 	ands	J, N , #7
 	ble	gemvn_kernel_S8_END
 gemvn_kernel_S8X1_10:
 	KERNEL_S8X1
 	subs	J, J, #1
 	bne	gemvn_kernel_S8X1_10
 gemvn_kernel_S8_END:
 	SAVE_S8
 	subs	I , I , #1
 	bne	gemvn_kernel_S8X8
 /* remaining M % 8 rows of the strided path, one row per pass */
 gemvn_kernel_S1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #7
 	ble	gemvn_kernel_L999
 gemvn_kernel_S1X1:
 	ldr	AO1, A
 	add	r3, AO1, #SIZE
 	str	r3, A
 	ldr	XO , X
 	INIT_S1
 	mov	J, N
 gemvn_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	J, J, #1
 	bne	gemvn_kernel_S1X1_10
 gemvn_kernel_S1_END:
 	SAVE_S1
 	subs	I , I , #1
 	bne	gemvn_kernel_S1X1
 /*************************************************************************************************************/
 /* Common exit: restore the saved floating point and core registers. */
 gemvn_kernel_L999:
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s31 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	/* sp back to just below the 7 pushed registers */
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/gemv_t.c
+++ b/kernel/arm/gemv_t.c
@ -0,0 +1,67 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 * 	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Reference GEMV kernel, transposed case: y := y + alpha * A^T * x.
  *
  * The loops read A as n columns of m consecutive elements, with the start
  * of successive columns lda elements apart; each column is reduced against
  * x to a single value that updates one element of y.
  *
  *   m, n      length of each column / number of columns processed
  *   dummy1    unused
  *   alpha     scalar applied to each column's dot product
  *   a, lda    matrix data and column-to-column distance in elements
  *   x, inc_x  input vector (m elements are read) and its element stride
  *   y, inc_y  vector updated in place (n elements) and its element stride
  *   buffer    unused
  *
  * Returns 0.
  */
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
 	BLASLONG i;
 	BLASLONG ix,iy;
 	BLASLONG j;
 	FLOAT *a_ptr;
 	FLOAT temp;
 	iy = 0;
 	a_ptr = a;
 	for (j=0; j<n; j++)
 	{
 		/* dot product of column j with x */
 		temp = 0.0;
 		ix = 0;
 		for (i=0; i<m; i++)
 		{
 			temp += a_ptr[i] * x[ix];
 			ix    += inc_x;
 		}
 		y[iy] += alpha * temp;
 		iy += inc_y;
 		a_ptr += lda;
 	}
 	/* the function is declared int, so it must not fall off the end */
 	return(0);
 }
--- a/kernel/arm/gemv_t_vfp.S
+++ b/kernel/arm/gemv_t_vfp.S
@ -0,0 +1,750 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/25 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes reserved below the pushed core registers for locals and the VFP save area. */
 #define STACKSIZE 256
 /*
  * Arguments read from the caller's stack.  The routine sets fp to the value
  * sp had on entry (sp after the 7-register push, plus 28), so these offsets
  * are relative to the entry stack pointer.
  */
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 /* Register arguments as they arrive; both are spilled to A / N before reuse. */
 #define OLD_A		r3
 #define	OLD_N		r1
 /* Working registers.  AO1 shares r1 with OLD_N. */
 #define M	r0
 #define AO1	r1
 #define J	r2
 #define AO2	r4
 #define XO	r5
 #define YO	r6
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 #define I	r12
 /* Stack slots holding the column count and the pointer to the next unprocessed column. */
 #define N	[fp, #-252 ]
 #define A	[fp, #-256 ]
 /* Prefetch distances in bytes used by the pld instructions in the kernels. */
 #define X_PRE	512
 #define A_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	defined(DOUBLE)
 /*
  * INIT_F2 (double): clear the two dot-product accumulators d2 and d3.
  * The zero is moved in from a core register instead of being computed as
  * x - x, because x - x is NaN whenever the register still holds NaN or
  * +/-Inf left over from earlier code.  I is usable as scratch here: every
  * call site reloads I immediately after this macro.  mov does not touch
  * the condition flags.
  */
 .macro INIT_F2
 	mov	I, #0
 	vmov	d2 , I, I
 	vmov	d3 , I, I
 .endm
 /*
  * KERNEL_F2X4 (double, unit stride): four rows of two columns.
  * Loads four x values into d12-d15, four elements of the AO1 column into
  * d8-d11 and of the AO2 column into d4-d7 (all three pointers
  * post-increment), then accumulates x*a into d2 (AO1) and d3 (AO2).
  */
 .macro KERNEL_F2X4
 	pld	[ XO  , #X_PRE ]
 	fldmiad	XO! ,  { d12 - d15 }
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	AO1!,  { d8 - d9   }
 	pld	[ AO2 , #A_PRE ]
 	fldmiad	AO2!,  { d4 - d5 }
 	fldmiad	AO1!,  { d10 - d11 }
 	fldmiad	AO2!,  { d6 - d7 }
 	vmla.f64	d2 , d12 , d8
 	vmla.f64	d3 , d12 , d4
 	vmla.f64	d2 , d13 , d9
 	vmla.f64	d3 , d13 , d5
 	vmla.f64	d2 , d14, d10
 	vmla.f64	d3 , d14, d6
 	vmla.f64	d2 , d15, d11
 	vmla.f64	d3 , d15, d7
 .endm
 /* KERNEL_F2X1: single-row version of KERNEL_F2X4 (x in d1, column elements in d8 / d4). */
 .macro KERNEL_F2X1
 	fldmiad	XO! ,  { d1 }
 	fldmiad	AO1!,  { d8 }
 	fldmiad	AO2!,  { d4 }
 	vmla.f64	d2 , d1 , d8
 	vmla.f64	d3 , d1 , d4
 .endm
 /*
  * SAVE_F2: y[0] += d0 * d2, y[1] += d0 * d3 for two adjacent y elements,
  * then advance YO past them.  d0 is the scaling factor held since entry.
  */
 .macro	SAVE_F2
 	fldmiad	YO,  { d4 - d5 }
 	vmla.f64	d4, d0, d2
 	vmla.f64	d5, d0, d3
 	fstmiad	YO!, { d4 - d5 }
 .endm
 /*
  * INIT_F1 (double): clear the single accumulator d2.
  * A zero is moved in from a core register rather than computed as x - x,
  * which would give NaN if d2 still held NaN or +/-Inf.  I is reloaded by
  * the caller right after this macro, so it is free as scratch.
  */
 .macro INIT_F1
 	mov	I, #0
 	vmov	d2 , I, I
 .endm
 /*
  * KERNEL_F1X4 (double, unit stride): four rows of one column.
  * x goes to d12-d15, the column elements to d8-d11 (XO and AO1
  * post-increment); the products are accumulated in d2.
  */
 .macro KERNEL_F1X4
 	pld	[ XO  , #X_PRE ]
 	fldmiad	XO! ,  { d12 - d15 }
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	AO1!,  { d8 - d9   }
 	fldmiad	AO1!,  { d10 - d11 }
 	vmla.f64	d2 , d12 , d8
 	vmla.f64	d2 , d13 , d9
 	vmla.f64	d2 , d14, d10
 	vmla.f64	d2 , d15, d11
 .endm
 /* KERNEL_F1X1: one row of one column, d2 += x * a. */
 .macro KERNEL_F1X1
 	fldmiad	XO! ,  { d1 }
 	fldmiad	AO1!,  { d8 }
 	vmla.f64	d2 , d1 , d8
 .endm
 /* SAVE_F1: y[0] += d0 * d2, then advance YO by one element. */
 .macro	SAVE_F1
 	fldmiad	YO,  { d4 }
 	vmla.f64	d4, d0, d2
 	fstmiad	YO!, { d4 }
 .endm
 /*
  * INIT_S2 (double, strided path): clear the accumulators d2 and d3.
  * The zero comes from a core register; x - x would be NaN if the register
  * still held NaN or +/-Inf.  I is reloaded by the caller right after this
  * macro, so it is free as scratch.
  */
 .macro INIT_S2
 	mov	I, #0
 	vmov	d2 , I, I
 	vmov	d3 , I, I
 .endm
 /*
  * KERNEL_S2X4 (double, strided x): four rows of two columns.
  * x is read one element at a time into d12..d15, with XO advanced by
  * INC_X (a byte stride, scaled by the routine before the strided loops)
  * after each load.  Column data is still contiguous: AO1 -> d8-d11,
  * AO2 -> d4-d7.  Accumulators are d2 (AO1) and d3 (AO2).
  */
 .macro KERNEL_S2X4
 	fldmiad	XO ,  { d12 }
 	add	XO, XO, INC_X
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	AO1!,  { d8 - d9   }
 	pld	[ AO2 , #A_PRE ]
 	fldmiad	AO2!,  { d4 - d5 }
 	fldmiad	XO ,  { d13 }
 	add	XO, XO, INC_X
 	fldmiad	AO1!,  { d10 - d11 }
 	fldmiad	AO2!,  { d6 - d7 }
 	fldmiad	XO ,  { d14 }
 	add	XO, XO, INC_X
 	fldmiad	XO ,  { d15 }
 	add	XO, XO, INC_X
 	vmla.f64	d2 , d12 , d8
 	vmla.f64	d3 , d12 , d4
 	vmla.f64	d2 , d13 , d9
 	vmla.f64	d3 , d13 , d5
 	vmla.f64	d2 , d14, d10
 	vmla.f64	d3 , d14, d6
 	vmla.f64	d2 , d15, d11
 	vmla.f64	d3 , d15, d7
 .endm
 /* KERNEL_S2X1: one row of two columns with strided x. */
 .macro KERNEL_S2X1
 	fldmiad	XO ,  { d1 }
 	fldmiad	AO1!,  { d8 }
 	fldmiad	AO2!,  { d4 }
 	vmla.f64	d2 , d1 , d8
 	add	XO, XO, INC_X
 	vmla.f64	d3 , d1 , d4
 .endm
 /*
  * SAVE_S2: update two y elements that are INC_Y bytes apart:
  * y += d0 * d2 for the first, y += d0 * d3 for the second.
  * YO ends up two strides further on.
  */
 .macro	SAVE_S2
 	fldmiad	YO,  { d4 }
 	vmla.f64	d4, d0, d2
 	fstmiad	YO, { d4  }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d5 }
 	vmla.f64	d5, d0, d3
 	fstmiad	YO, { d5  }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_S1 (double, strided path): clear the accumulator d2.
  * The zero comes from a core register; x - x would be NaN if d2 still
  * held NaN or +/-Inf.  I is reloaded by the caller right after this macro.
  */
 .macro INIT_S1
 	mov	I, #0
 	vmov	d2 , I, I
 .endm
 /*
  * KERNEL_S1X4 (double, strided x): four rows of one column.
  * x elements are fetched singly into d12..d15 (XO += INC_X after each),
  * column elements contiguously into d8-d11; products accumulate in d2.
  */
 .macro KERNEL_S1X4
 	fldmiad	XO ,  { d12 }
 	add	XO, XO, INC_X
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	AO1!,  { d8 - d9   }
 	fldmiad	XO ,  { d13 }
 	add	XO, XO, INC_X
 	fldmiad	AO1!,  { d10 - d11 }
 	fldmiad	XO ,  { d14 }
 	add	XO, XO, INC_X
 	fldmiad	XO ,  { d15 }
 	add	XO, XO, INC_X
 	vmla.f64	d2 , d12 , d8
 	vmla.f64	d2 , d13 , d9
 	vmla.f64	d2 , d14, d10
 	vmla.f64	d2 , d15, d11
 .endm
 /* KERNEL_S1X1: one row of one column with strided x. */
 .macro KERNEL_S1X1
 	fldmiad	XO ,  { d1 }
 	fldmiad	AO1!,  { d8 }
 	vmla.f64	d2 , d1 , d8
 	add	XO, XO, INC_X
 .endm
 /* SAVE_S1: y += d0 * d2 for one y element, then step YO by INC_Y bytes. */
 .macro	SAVE_S1
 	fldmiad	YO,  { d4 }
 	vmla.f64	d4, d0, d2
 	fstmiad	YO, { d4  }
 	add	YO, YO, INC_Y
 .endm
 #else	/************************* SINGLE PRECISION *****************************************/
 /*
  * INIT_F2 (single): clear the two dot-product accumulators s2 and s3.
  * The zero is moved in from a core register instead of being computed as
  * x - x, because x - x is NaN whenever the register still holds NaN or
  * +/-Inf left over from earlier code.  I is usable as scratch here: every
  * call site reloads I immediately after this macro.
  */
 .macro INIT_F2
 	mov	I, #0
 	vmov	s2 , I
 	vmov	s3 , I
 .endm
 /*
  * KERNEL_F2X4 (single, unit stride): four rows of two columns.
  * x -> s12-s15, AO1 column -> s8-s11, AO2 column -> s4-s7 (all pointers
  * post-increment); accumulates into s2 (AO1) and s3 (AO2).
  */
 .macro KERNEL_F2X4
 	fldmias	XO! ,  { s12 - s15 }
 	fldmias	AO1!,  { s8 - s9   }
 	fldmias	AO2!,  { s4 - s5 }
 	fldmias	AO1!,  { s10 - s11 }
 	fldmias	AO2!,  { s6 - s7 }
 	vmla.f32	s2 , s12 , s8
 	vmla.f32	s3 , s12 , s4
 	vmla.f32	s2 , s13 , s9
 	vmla.f32	s3 , s13 , s5
 	vmla.f32	s2 , s14, s10
 	vmla.f32	s3 , s14, s6
 	vmla.f32	s2 , s15, s11
 	vmla.f32	s3 , s15, s7
 .endm
 /* KERNEL_F2X1: one row of two columns (x in s1, column elements in s8 / s4). */
 .macro KERNEL_F2X1
 	fldmias	XO! ,  { s1 }
 	fldmias	AO1!,  { s8 }
 	fldmias	AO2!,  { s4 }
 	vmla.f32	s2 , s1 , s8
 	vmla.f32	s3 , s1 , s4
 .endm
 /* SAVE_F2: y[0] += s0 * s2, y[1] += s0 * s3, then advance YO by two elements. */
 .macro	SAVE_F2
 	fldmias	YO,  { s4 - s5 }
 	vmla.f32	s4, s0, s2
 	vmla.f32	s5, s0, s3
 	fstmias	YO!, { s4 - s5 }
 .endm
 /*
  * INIT_F1 (single): clear the accumulator s2.
  * The zero comes from a core register; x - x would be NaN if s2 still
  * held NaN or +/-Inf.  I is reloaded by the caller right after this macro.
  */
 .macro INIT_F1
 	mov	I, #0
 	vmov	s2 , I
 .endm
 /*
  * KERNEL_F1X4 (single, unit stride): four rows of one column.
  * x -> s12-s15, column -> s8-s11 (post-increment); accumulates into s2.
  */
 .macro KERNEL_F1X4
 	fldmias	XO! ,  { s12 - s15 }
 	fldmias	AO1!,  { s8 - s9   }
 	fldmias	AO1!,  { s10 - s11 }
 	vmla.f32	s2 , s12 , s8
 	vmla.f32	s2 , s13 , s9
 	vmla.f32	s2 , s14, s10
 	vmla.f32	s2 , s15, s11
 .endm
 /* KERNEL_F1X1: one row of one column, s2 += x * a. */
 .macro KERNEL_F1X1
 	fldmias	XO! ,  { s1 }
 	fldmias	AO1!,  { s8 }
 	vmla.f32	s2 , s1 , s8
 .endm
 /* SAVE_F1: y[0] += s0 * s2, then advance YO by one element. */
 .macro	SAVE_F1
 	fldmias	YO,  { s4 }
 	vmla.f32	s4, s0, s2
 	fstmias	YO!, { s4 }
 .endm
 /*
  * INIT_S2 (single, strided path): clear the accumulators s2 and s3.
  * The zero comes from a core register; x - x would be NaN if the register
  * still held NaN or +/-Inf.  I is reloaded by the caller right after this
  * macro, so it is free as scratch.
  */
 .macro INIT_S2
 	mov	I, #0
 	vmov	s2 , I
 	vmov	s3 , I
 .endm
 /*
  * KERNEL_S2X4 (single, strided x): four rows of two columns.
  * x is fetched one element at a time into s12..s15 with XO += INC_X
  * (byte stride) after each load; AO1 -> s8-s11 and AO2 -> s4-s7 are read
  * contiguously.  Accumulators are s2 (AO1) and s3 (AO2).
  */
 .macro KERNEL_S2X4
 	fldmias	XO ,  { s12 }
 	add	XO, XO, INC_X
 	fldmias	AO1!,  { s8 - s9   }
 	fldmias	AO2!,  { s4 - s5 }
 	fldmias	XO ,  { s13 }
 	add	XO, XO, INC_X
 	fldmias	AO1!,  { s10 - s11 }
 	fldmias	AO2!,  { s6 - s7 }
 	fldmias	XO ,  { s14 }
 	add	XO, XO, INC_X
 	fldmias	XO ,  { s15 }
 	add	XO, XO, INC_X
 	vmla.f32	s2 , s12 , s8
 	vmla.f32	s3 , s12 , s4
 	vmla.f32	s2 , s13 , s9
 	vmla.f32	s3 , s13 , s5
 	vmla.f32	s2 , s14, s10
 	vmla.f32	s3 , s14, s6
 	vmla.f32	s2 , s15, s11
 	vmla.f32	s3 , s15, s7
 .endm
 /* KERNEL_S2X1: one row of two columns with strided x. */
 .macro KERNEL_S2X1
 	fldmias	XO ,  { s1 }
 	fldmias	AO1!,  { s8 }
 	fldmias	AO2!,  { s4 }
 	vmla.f32	s2 , s1 , s8
 	add	XO, XO, INC_X
 	vmla.f32	s3 , s1 , s4
 .endm
 /*
  * SAVE_S2: update two y elements INC_Y bytes apart
  * (y += s0 * s2, then y += s0 * s3); YO advances by two strides.
  */
 .macro	SAVE_S2
 	fldmias	YO,  { s4 }
 	vmla.f32	s4, s0, s2
 	fstmias	YO, { s4  }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s5 }
 	vmla.f32	s5, s0, s3
 	fstmias	YO, { s5  }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_S1 (single, strided path): clear the accumulator s2.
  * The zero comes from a core register; x - x would be NaN if s2 still
  * held NaN or +/-Inf.  I is reloaded by the caller right after this macro.
  */
 .macro INIT_S1
 	mov	I, #0
 	vmov	s2 , I
 .endm
 /*
  * KERNEL_S1X4 (single, strided x): four rows of one column.
  * x elements are fetched singly into s12..s15 (XO += INC_X after each),
  * column elements contiguously into s8-s11; products accumulate in s2.
  */
 .macro KERNEL_S1X4
 	fldmias	XO ,  { s12 }
 	add	XO, XO, INC_X
 	pld	[ AO1 , #A_PRE ]
 	fldmias	AO1!,  { s8 - s9   }
 	fldmias	XO ,  { s13 }
 	add	XO, XO, INC_X
 	fldmias	AO1!,  { s10 - s11 }
 	fldmias	XO ,  { s14 }
 	add	XO, XO, INC_X
 	fldmias	XO ,  { s15 }
 	add	XO, XO, INC_X
 	vmla.f32	s2 , s12 , s8
 	vmla.f32	s2 , s13 , s9
 	vmla.f32	s2 , s14, s10
 	vmla.f32	s2 , s15, s11
 .endm
 /* KERNEL_S1X1: one row of one column with strided x. */
 .macro KERNEL_S1X1
 	fldmias	XO ,  { s1 }
 	fldmias	AO1!,  { s8 }
 	vmla.f32	s2 , s1 , s8
 	add	XO, XO, INC_X
 .endm
 /* SAVE_S1: y += s0 * s2 for one y element, then step YO by INC_Y bytes. */
 .macro	SAVE_S1
 	fldmias	YO,  { s4 }
 	vmla.f32	s4, s0, s2
 	fstmias	YO, { s4  }
 	add	YO, YO, INC_Y
 .endm
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Routine body.  On entry M is r0, the column count is r1 (OLD_N) and the
  * matrix pointer is r3 (OLD_A); lda, x, inc_x, y and inc_y are read from the
  * caller's stack through fp.  d0/s0 is the factor applied by the SAVE_*
  * macros (presumably alpha, passed in a VFP register under the hard-float
  * ABI selected in Makefile.arm -- confirm).
  *
  * Flow: early exit for M <= 0, N <= 0, inc_x == 0 or inc_y == 0; otherwise
  * columns are processed two at a time (plus one trailing column when N is
  * odd), each column walking M rows four at a time plus an M & 3 tail.
  * The F* labels handle inc_x == inc_y == 1, the S* labels everything else.
  * r0 is 0 on return.
  *
  * NOTE(review): several "ble" branches follow asrs/ands, which leave the V
  * flag unchanged, so they depend on V from an earlier instruction.  Since
  * M and N are known to be positive at those points, "beq" looks like the
  * exact condition -- confirm before changing.
  */
 	PROLOGUE
 	.align 5
 	// 7 registers = 28 bytes; fp is set back to the sp value at entry
 	push    {r4 - r9 , fp}
        add     fp, sp, #28
 	sub     sp, sp, #STACKSIZE                              // reserve stack
        sub     r12, fp, #192
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
        vstm    r12, { s8 - s15 }                                 // store floating point registers
 #endif
 	// nothing to do for empty dimensions
 	cmp	M, #0
 	ble	gemvt_kernel_L999
 	cmp	OLD_N, #0
 	ble	gemvt_kernel_L999
 	// spill A and N: r3 and r1 are reused below
 	str	OLD_A, A
 	str	OLD_N, N
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	// a zero increment is treated as "nothing to do"
 	cmp	INC_X, #0
 	beq	gemvt_kernel_L999
 	cmp	INC_Y, #0
 	beq	gemvt_kernel_L999
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #3				// LDA * SIZE
 #else
 	lsl	LDA, LDA, #2				// LDA * SIZE
 #endif
 	// unit-stride code is used only when both increments are 1
 	cmp	INC_X, #1
 	bne	gemvt_kernel_S2_BEGIN
 	cmp	INC_Y, #1
 	bne	gemvt_kernel_S2_BEGIN
 	// ---- unit stride: pairs of columns ----
 gemvt_kernel_F2_BEGIN:
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	gemvt_kernel_F1_BEGIN
 gemvt_kernel_F2X4:
 	// AO1 / AO2 = current pair of columns; A is advanced past both
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	ldr	XO , X
 	INIT_F2
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_F2X1
 gemvt_kernel_F2X4_10:
 	KERNEL_F2X4
 	subs	I, I, #1
 	bne	gemvt_kernel_F2X4_10
 gemvt_kernel_F2X1:
 	// remaining M mod 4 rows
 	ands	I, M , #3
 	ble	gemvt_kernel_F2_END
 gemvt_kernel_F2X1_10:
 	KERNEL_F2X1
 	subs	I, I, #1
 	bne	gemvt_kernel_F2X1_10
 gemvt_kernel_F2_END:
 	SAVE_F2
 	subs	J , J , #1
 	bne	gemvt_kernel_F2X4
 	// ---- unit stride: last column when N is odd ----
 gemvt_kernel_F1_BEGIN:
 	ldr	J, N
 	ands	J, J, #1
 	ble	gemvt_kernel_L999
 gemvt_kernel_F1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_F1
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_F1X1
 gemvt_kernel_F1X4_10:
 	KERNEL_F1X4
 	subs	I, I, #1
 	bne	gemvt_kernel_F1X4_10
 gemvt_kernel_F1X1:
 	ands	I, M , #3
 	ble	gemvt_kernel_F1_END
 gemvt_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	I, I, #1
 	bne	gemvt_kernel_F1X1_10
 gemvt_kernel_F1_END:
 	SAVE_F1
 	b	gemvt_kernel_L999
 /*************************************************************************************************************/
 	// ---- strided x and/or y: same loop structure, increments turned into byte strides ----
 gemvt_kernel_S2_BEGIN:
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 #endif
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	gemvt_kernel_S1_BEGIN
 gemvt_kernel_S2X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	ldr	XO , X
 	INIT_S2
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_S2X1
 gemvt_kernel_S2X4_10:
 	KERNEL_S2X4
 	subs	I, I, #1
 	bne	gemvt_kernel_S2X4_10
 gemvt_kernel_S2X1:
 	ands	I, M , #3
 	ble	gemvt_kernel_S2_END
 gemvt_kernel_S2X1_10:
 	KERNEL_S2X1
 	subs	I, I, #1
 	bne	gemvt_kernel_S2X1_10
 gemvt_kernel_S2_END:
 	SAVE_S2
 	subs	J , J , #1
 	bne	gemvt_kernel_S2X4
 gemvt_kernel_S1_BEGIN:
 	ldr	J, N
 	ands	J, J, #1
 	ble	gemvt_kernel_L999
 gemvt_kernel_S1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_S1
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_S1X1
 gemvt_kernel_S1X4_10:
 	KERNEL_S1X4
 	subs	I, I, #1
 	bne	gemvt_kernel_S1X4_10
 gemvt_kernel_S1X1:
 	ands	I, M , #3
 	ble	gemvt_kernel_S1_END
 gemvt_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	I, I, #1
 	bne	gemvt_kernel_S1X1_10
 gemvt_kernel_S1_END:
 	SAVE_S1
 /*************************************************************************************************************/
 	// common exit: restore VFP registers, unwind the frame, return 0
 gemvt_kernel_L999:
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s15 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/gemv_t_vfpv3.S
+++ b/kernel/arm/gemv_t_vfpv3.S
@ -0,0 +1,732 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/18 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes reserved below the pushed core registers for locals and the VFP save area. */
 #define STACKSIZE 256
 /*
  * Arguments read from the caller's stack.  The routine sets fp to the value
  * sp had on entry (sp after the 7-register push, plus 28), so these offsets
  * are relative to the entry stack pointer.
  */
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 /* Register arguments as they arrive; both are spilled to A / N before reuse. */
 #define OLD_A		r3
 #define	OLD_N		r1
 /* Working registers.  AO1 shares r1 with OLD_N. */
 #define M	r0
 #define AO1	r1
 #define J	r2
 #define AO2	r4
 #define XO	r5
 #define YO	r6
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 #define I	r12
 /* Stack slots holding the column count and the pointer to the next unprocessed column. */
 #define N	[fp, #-252 ]
 #define A	[fp, #-256 ]
 /* Prefetch distances in bytes used by the pld instructions in the kernels. */
 #define X_PRE	512
 #define A_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	defined(DOUBLE)
 /*
  * INIT_F2 (double): clear the two dot-product accumulators d4 and d5.
  * The zero is moved in from a core register instead of being computed as
  * x - x, because x - x is NaN whenever the register still holds NaN or
  * +/-Inf left over from earlier code.  I is usable as scratch here: every
  * call site reloads I immediately after this macro.  mov does not touch
  * the condition flags.
  */
 .macro INIT_F2
 	mov	I, #0
 	vmov	d4 , I, I
 	vmov	d5 , I, I
 .endm
 /*
  * KERNEL_F2X4 (double, unit stride): four rows of two columns.
  * x -> d28-d31, AO1 column -> d8-d11, AO2 column -> d16-d19 (all pointers
  * post-increment).  Loads are interleaved with the multiply-accumulates
  * into d4 (AO1) and d5 (AO2).
  */
 .macro KERNEL_F2X4
 	pld	[ XO  , #X_PRE ]
 	fldmiad	XO! ,  { d28 - d31 }
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	AO1!,  { d8 - d9   }
 	pld	[ AO2 , #A_PRE ]
 	fldmiad	AO2!,  { d16 - d17 }
 	vmla.f64	d4 , d28 , d8
 	vmla.f64	d5 , d28 , d16
 	fldmiad	AO1!,  { d10 - d11 }
 	vmla.f64	d4 , d29 , d9
 	vmla.f64	d5 , d29 , d17
 	fldmiad	AO2!,  { d18 - d19 }
 	vmla.f64	d4 , d30, d10
 	vmla.f64	d5 , d30, d18
 	vmla.f64	d4 , d31, d11
 	vmla.f64	d5 , d31, d19
 .endm
 /* KERNEL_F2X1: one row of two columns (x in d2, column elements in d8 / d16). */
 .macro KERNEL_F2X1
 	fldmiad	XO! ,  { d2 }
 	fldmiad	AO1!,  { d8 }
 	fldmiad	AO2!,  { d16 }
 	vmla.f64	d4 , d2 , d8
 	vmla.f64	d5 , d2 , d16
 .endm
 /* SAVE_F2: y[0] += d0 * d4, y[1] += d0 * d5, then advance YO by two elements. */
 .macro	SAVE_F2
 	fldmiad	YO,  { d24 - d25 }
 	vmla.f64	d24, d0, d4
 	vmla.f64	d25, d0, d5
 	fstmiad	YO!, { d24 - d25 }
 .endm
 /*
  * INIT_S2 (double, strided path): clear the accumulators d4 and d5.
  * The zero comes from a core register; x - x would be NaN if the register
  * still held NaN or +/-Inf.  I is reloaded by the caller right after this
  * macro, so it is free as scratch.
  */
 .macro INIT_S2
 	mov	I, #0
 	vmov	d4 , I, I
 	vmov	d5 , I, I
 .endm
 /*
  * KERNEL_S2X4 (double, strided x): four rows of two columns.
  * x is fetched one element at a time into d28..d31 with XO += INC_X
  * (byte stride) after each load; AO1 -> d8-d11 and AO2 -> d16-d19 are
  * read contiguously.  Loads are interleaved with the accumulation into
  * d4 (AO1) and d5 (AO2).
  */
 .macro KERNEL_S2X4
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	XO ,  { d28 }
 	add	XO, XO, INC_X
 	fldmiad	AO1!,  { d8 - d9   }
 	pld	[ AO2 , #A_PRE ]
 	fldmiad	AO2!,  { d16 - d17 }
 	vmla.f64	d4 , d28 , d8
 	fldmiad	XO ,  { d29 }
 	add	XO, XO, INC_X
 	vmla.f64	d5 , d28 , d16
 	fldmiad	AO1!,  { d10 - d11 }
 	vmla.f64	d4 , d29 , d9
 	fldmiad	XO ,  { d30 }
 	add	XO, XO, INC_X
 	vmla.f64	d5 , d29 , d17
 	fldmiad	AO2!,  { d18 - d19 }
 	vmla.f64	d4 , d30, d10
 	fldmiad	XO ,  { d31 }
 	add	XO, XO, INC_X
 	vmla.f64	d5 , d30, d18
 	vmla.f64	d4 , d31, d11
 	vmla.f64	d5 , d31, d19
 .endm
 /* KERNEL_S2X1: one row of two columns with strided x. */
 .macro KERNEL_S2X1
 	fldmiad	XO ,  { d2 }
 	fldmiad	AO1!,  { d8 }
 	add	XO, XO, INC_X
 	fldmiad	AO2!,  { d16 }
 	vmla.f64	d4 , d2 , d8
 	vmla.f64	d5 , d2 , d16
 .endm
 /*
  * SAVE_S2: update two y elements INC_Y bytes apart
  * (y += d0 * d4, then y += d0 * d5); d24 is the temporary for both.
  */
 .macro	SAVE_S2
 	fldmiad	YO,  { d24 }
 	vmla.f64	d24, d0, d4
 	fstmiad	YO,  { d24 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d24 }
 	vmla.f64	d24, d0, d5
 	fstmiad	YO,  { d24 }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_F1 (double): clear the accumulator d4.
  * The zero comes from a core register; x - x would be NaN if d4 still
  * held NaN or +/-Inf.  I is reloaded by the caller right after this macro.
  */
 .macro INIT_F1
 	mov	I, #0
 	vmov	d4 , I, I
 .endm
 /*
  * KERNEL_F1X4 (double, unit stride): four rows of one column.
  * x -> d28-d31, column -> d8-d11 (post-increment); accumulates into d4.
  */
 .macro KERNEL_F1X4
 	pld	[ XO  , #X_PRE ]
 	fldmiad	XO! ,  { d28 - d31 }
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	AO1!,  { d8 - d9   }
 	vmla.f64	d4 , d28 , d8
 	fldmiad	AO1!,  { d10 - d11 }
 	vmla.f64	d4 , d29 , d9
 	vmla.f64	d4 , d30, d10
 	vmla.f64	d4 , d31, d11
 .endm
 /* KERNEL_F1X1: one row of one column, d4 += x * a. */
 .macro KERNEL_F1X1
 	fldmiad	XO! ,  { d2 }
 	fldmiad	AO1!,  { d8 }
 	vmla.f64	d4 , d2 , d8
 .endm
 /* SAVE_F1: y[0] += d0 * d4, then advance YO by one element. */
 .macro	SAVE_F1
 	fldmiad	YO,  { d24 }
 	vmla.f64	d24, d0, d4
 	fstmiad	YO!, { d24 }
 .endm
 /*
  * INIT_S1 (double, strided path): clear the accumulator d4.
  * The zero comes from a core register; x - x would be NaN if d4 still
  * held NaN or +/-Inf.  I is reloaded by the caller right after this macro.
  */
 .macro INIT_S1
 	mov	I, #0
 	vmov	d4 , I, I
 .endm
 /*
  * KERNEL_S1X4 (double, strided x): four rows of one column.
  * x elements are fetched singly into d28..d31 (XO += INC_X after each),
  * column elements contiguously into d8-d11; products accumulate in d4.
  */
 .macro KERNEL_S1X4
 	pld	[ AO1 , #A_PRE ]
 	fldmiad	XO ,  { d28 }
 	add	XO, XO, INC_X
 	fldmiad	AO1!,  { d8 - d9   }
 	vmla.f64	d4 , d28 , d8
 	fldmiad	XO ,  { d29 }
 	add	XO, XO, INC_X
 	fldmiad	AO1!,  { d10 - d11 }
 	vmla.f64	d4 , d29 , d9
 	fldmiad	XO ,  { d30 }
 	add	XO, XO, INC_X
 	vmla.f64	d4 , d30, d10
 	fldmiad	XO ,  { d31 }
 	add	XO, XO, INC_X
 	vmla.f64	d4 , d31, d11
 .endm
 /* KERNEL_S1X1: one row of one column with strided x. */
 .macro KERNEL_S1X1
 	fldmiad	XO ,  { d2 }
 	fldmiad	AO1!,  { d8 }
 	add	XO, XO, INC_X
 	vmla.f64	d4 , d2 , d8
 .endm
 /* SAVE_S1: y += d0 * d4 for one y element, then step YO by INC_Y bytes. */
 .macro	SAVE_S1
 	fldmiad	YO,  { d24 }
 	vmla.f64	d24, d0, d4
 	fstmiad	YO,  { d24 }
 	add	YO, YO, INC_Y
 .endm
 #else	/************************* SINGLE PRECISION *****************************************/
 /*
  * INIT_F2 (single): clear the two dot-product accumulators s4 and s5.
  * The zero is moved in from a core register instead of being computed as
  * x - x, because x - x is NaN whenever the register still holds NaN or
  * +/-Inf left over from earlier code.  I is usable as scratch here: every
  * call site reloads I immediately after this macro.
  */
 .macro INIT_F2
 	mov	I, #0
 	vmov	s4 , I
 	vmov	s5 , I
 .endm
 /*
  * KERNEL_F2X4 (single, unit stride): four rows of two columns.
  * x -> s28-s31, AO1 column -> s8-s11, AO2 column -> s16-s19 (all pointers
  * post-increment); loads are interleaved with the accumulation into
  * s4 (AO1) and s5 (AO2).
  */
 .macro KERNEL_F2X4
 	fldmias	XO! ,  { s28 - s31 }
 	fldmias	AO1!,  { s8 - s9   }
 	fldmias	AO2!,  { s16 - s17 }
 	vmla.f32	s4 , s28 , s8
 	vmla.f32	s5 , s28 , s16
 	fldmias	AO1!,  { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
 	vmla.f32	s5 , s29 , s17
 	fldmias	AO2!,  { s18 - s19 }
 	vmla.f32	s4 , s30, s10
 	vmla.f32	s5 , s30, s18
 	vmla.f32	s4 , s31, s11
 	vmla.f32	s5 , s31, s19
 .endm
 /* KERNEL_F2X1: one row of two columns (x in s2, column elements in s8 / s16). */
 .macro KERNEL_F2X1
 	fldmias	XO! ,  { s2 }
 	fldmias	AO1!,  { s8 }
 	fldmias	AO2!,  { s16 }
 	vmla.f32	s4 , s2 , s8
 	vmla.f32	s5 , s2 , s16
 .endm
 /* SAVE_F2: y[0] += s0 * s4, y[1] += s0 * s5, then advance YO by two elements. */
 .macro	SAVE_F2
 	fldmias	YO,  { s24 - s25 }
 	vmla.f32	s24, s0, s4
 	vmla.f32	s25, s0, s5
 	fstmias	YO!, { s24 - s25 }
 .endm
 /*
  * INIT_S2 (single, strided path): clear the accumulators s4 and s5.
  * The zero comes from a core register; x - x would be NaN if the register
  * still held NaN or +/-Inf.  I is reloaded by the caller right after this
  * macro, so it is free as scratch.
  */
 .macro INIT_S2
 	mov	I, #0
 	vmov	s4 , I
 	vmov	s5 , I
 .endm
 /*
  * KERNEL_S2X4 (single, strided x): four rows of two columns.
  * x is fetched one element at a time into s28..s31 with XO += INC_X
  * (byte stride) after each load; AO1 -> s8-s11 and AO2 -> s16-s19 are
  * read contiguously.  Accumulators are s4 (AO1) and s5 (AO2).
  */
 .macro KERNEL_S2X4
 	fldmias	XO ,  { s28 }
 	add	XO, XO, INC_X
 	fldmias	AO1!,  { s8 - s9   }
 	fldmias	AO2!,  { s16 - s17 }
 	vmla.f32	s4 , s28 , s8
 	fldmias	XO ,  { s29 }
 	add	XO, XO, INC_X
 	vmla.f32	s5 , s28 , s16
 	fldmias	AO1!,  { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
 	fldmias	XO ,  { s30 }
 	add	XO, XO, INC_X
 	vmla.f32	s5 , s29 , s17
 	fldmias	AO2!,  { s18 - s19 }
 	vmla.f32	s4 , s30, s10
 	fldmias	XO ,  { s31 }
 	add	XO, XO, INC_X
 	vmla.f32	s5 , s30, s18
 	vmla.f32	s4 , s31, s11
 	vmla.f32	s5 , s31, s19
 .endm
 /* KERNEL_S2X1: one row of two columns with strided x. */
 .macro KERNEL_S2X1
 	fldmias	XO ,  { s2 }
 	fldmias	AO1!,  { s8 }
 	add	XO, XO, INC_X
 	fldmias	AO2!,  { s16 }
 	vmla.f32	s4 , s2 , s8
 	vmla.f32	s5 , s2 , s16
 .endm
 /*
  * SAVE_S2: update two y elements INC_Y bytes apart
  * (y += s0 * s4, then y += s0 * s5); s24 is the temporary for both.
  */
 .macro	SAVE_S2
 	fldmias	YO,  { s24 }
 	vmla.f32	s24, s0, s4
 	fstmias	YO,  { s24 }
 	add	YO, YO, INC_Y
 	fldmias	YO,  { s24 }
 	vmla.f32	s24, s0, s5
 	fstmias	YO,  { s24 }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_F1 (single): clear the accumulator s4.
  * The zero comes from a core register; x - x would be NaN if s4 still
  * held NaN or +/-Inf.  I is reloaded by the caller right after this macro.
  */
 .macro INIT_F1
 	mov	I, #0
 	vmov	s4 , I
 .endm
 /*
  * KERNEL_F1X4 (single, unit stride): four rows of one column.
  * x -> s28-s31, column -> s8-s11 (post-increment); accumulates into s4.
  */
 .macro KERNEL_F1X4
 	fldmias	XO! ,  { s28 - s31 }
 	fldmias	AO1!,  { s8 - s9   }
 	vmla.f32	s4 , s28 , s8
 	fldmias	AO1!,  { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
 	vmla.f32	s4 , s30, s10
 	vmla.f32	s4 , s31, s11
 .endm
 /* KERNEL_F1X1: one row of one column, s4 += x * a. */
 .macro KERNEL_F1X1
 	fldmias	XO! ,  { s2 }
 	fldmias	AO1!,  { s8 }
 	vmla.f32	s4 , s2 , s8
 .endm
 /* SAVE_F1: y[0] += s0 * s4, then advance YO by one element. */
 .macro	SAVE_F1
 	fldmias	YO,  { s24 }
 	vmla.f32	s24, s0, s4
 	fstmias	YO!, { s24 }
 .endm
 /*
  * INIT_S1 (single, strided path): clear the accumulator s4.
  * The zero comes from a core register; x - x would be NaN if s4 still
  * held NaN or +/-Inf.  I is reloaded by the caller right after this macro.
  */
 .macro INIT_S1
 	mov	I, #0
 	vmov	s4 , I
 .endm
 /*
  * KERNEL_S1X4 (single, strided x): four rows of one column.
  * x elements are fetched singly into s28..s31 (XO += INC_X after each),
  * column elements contiguously into s8-s11; products accumulate in s4.
  */
 .macro KERNEL_S1X4
 	fldmias	XO ,  { s28 }
 	add	XO, XO, INC_X
 	fldmias	AO1!,  { s8 - s9   }
 	vmla.f32	s4 , s28 , s8
 	fldmias	XO ,  { s29 }
 	add	XO, XO, INC_X
 	fldmias	AO1!,  { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
 	fldmias	XO ,  { s30 }
 	add	XO, XO, INC_X
 	vmla.f32	s4 , s30, s10
 	fldmias	XO ,  { s31 }
 	add	XO, XO, INC_X
 	vmla.f32	s4 , s31, s11
 .endm
 /* KERNEL_S1X1: one row of one column with strided x. */
 .macro KERNEL_S1X1
 	fldmias	XO ,  { s2 }
 	fldmias	AO1!,  { s8 }
 	add	XO, XO, INC_X
 	vmla.f32	s4 , s2 , s8
 .endm
 /* SAVE_S1: y += s0 * s4 for one y element, then step YO by INC_Y bytes. */
 .macro	SAVE_S1
 	fldmias	YO,  { s24 }
 	vmla.f32	s24, s0, s4
 	fstmias	YO,  { s24 }
 	add	YO, YO, INC_Y
 .endm
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Routine body.  On entry M is r0, the column count is r1 (OLD_N) and the
  * matrix pointer is r3 (OLD_A); lda, x, inc_x, y and inc_y are read from the
  * caller's stack through fp.  d0/s0 is the factor applied by the SAVE_*
  * macros (presumably alpha, passed in a VFP register under the hard-float
  * ABI selected in Makefile.arm -- confirm).
  *
  * Flow: early exit for M <= 0, N <= 0, inc_x == 0 or inc_y == 0; otherwise
  * columns are processed two at a time (plus one trailing column when N is
  * odd), each column walking M rows four at a time plus an M & 3 tail.
  * The F* labels handle inc_x == inc_y == 1, the S* labels everything else.
  * r0 is 0 on return.
  *
  * NOTE(review): several "ble" branches follow asrs/ands, which leave the V
  * flag unchanged, so they depend on V from an earlier instruction.  Since
  * M and N are known to be positive at those points, "beq" looks like the
  * exact condition -- confirm before changing.
  */
 	PROLOGUE
 	.align 5
 	// 7 registers = 28 bytes; fp is set back to the sp value at entry
 	push    {r4 - r9 , fp}
        add     fp, sp, #28
 	sub     sp, sp, #STACKSIZE                              // reserve stack
        sub     r12, fp, #192
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
        vstm    r12, { s8 - s31 }                                 // store floating point registers
 #endif
 	// nothing to do for empty dimensions
 	cmp	M, #0
 	ble	gemvt_kernel_L999
 	cmp	OLD_N, #0
 	ble	gemvt_kernel_L999
 	// spill A and N: r3 and r1 are reused below
 	str	OLD_A, A
 	str	OLD_N, N
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	// a zero increment is treated as "nothing to do"
 	cmp	INC_X, #0
 	beq	gemvt_kernel_L999
 	cmp	INC_Y, #0
 	beq	gemvt_kernel_L999
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #3				// LDA * SIZE
 #else
 	lsl	LDA, LDA, #2				// LDA * SIZE
 #endif
 	// unit-stride code is used only when both increments are 1
 	cmp	INC_X, #1
 	bne	gemvt_kernel_S2_BEGIN
 	cmp	INC_Y, #1
 	bne	gemvt_kernel_S2_BEGIN
 	// ---- unit stride: pairs of columns ----
 gemvt_kernel_F2_BEGIN:
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	gemvt_kernel_F1_BEGIN
 gemvt_kernel_F2X4:
 	// AO1 / AO2 = current pair of columns; A is advanced past both
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	ldr	XO , X
 	INIT_F2
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_F2X1
 gemvt_kernel_F2X4_10:
 	KERNEL_F2X4
 	subs	I, I, #1
 	bne	gemvt_kernel_F2X4_10
 gemvt_kernel_F2X1:
 	// remaining M mod 4 rows
 	ands	I, M , #3
 	ble	gemvt_kernel_F2_END
 gemvt_kernel_F2X1_10:
 	KERNEL_F2X1
 	subs	I, I, #1
 	bne	gemvt_kernel_F2X1_10
 gemvt_kernel_F2_END:
 	SAVE_F2
 	subs	J , J , #1
 	bne	gemvt_kernel_F2X4
 	// ---- unit stride: last column when N is odd ----
 gemvt_kernel_F1_BEGIN:
 	ldr	J, N
 	ands	J, J, #1
 	ble	gemvt_kernel_L999
 gemvt_kernel_F1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_F1
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_F1X1
 gemvt_kernel_F1X4_10:
 	KERNEL_F1X4
 	subs	I, I, #1
 	bne	gemvt_kernel_F1X4_10
 gemvt_kernel_F1X1:
 	ands	I, M , #3
 	ble	gemvt_kernel_F1_END
 gemvt_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	I, I, #1
 	bne	gemvt_kernel_F1X1_10
 gemvt_kernel_F1_END:
 	SAVE_F1
 	b	gemvt_kernel_L999
 /*************************************************************************************************************/
 	// ---- strided x and/or y: same loop structure, increments turned into byte strides ----
 gemvt_kernel_S2_BEGIN:
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 #endif
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	gemvt_kernel_S1_BEGIN
 gemvt_kernel_S2X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	ldr	XO , X
 	INIT_S2
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_S2X1
 gemvt_kernel_S2X4_10:
 	KERNEL_S2X4
 	subs	I, I, #1
 	bne	gemvt_kernel_S2X4_10
 gemvt_kernel_S2X1:
 	ands	I, M , #3
 	ble	gemvt_kernel_S2_END
 gemvt_kernel_S2X1_10:
 	KERNEL_S2X1
 	subs	I, I, #1
 	bne	gemvt_kernel_S2X1_10
 gemvt_kernel_S2_END:
 	SAVE_S2
 	subs	J , J , #1
 	bne	gemvt_kernel_S2X4
 gemvt_kernel_S1_BEGIN:
 	ldr	J, N
 	ands	J, J, #1
 	ble	gemvt_kernel_L999
 gemvt_kernel_S1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_S1
 	asrs	I, M, #2					// I = M / 4
 	ble	gemvt_kernel_S1X1
 gemvt_kernel_S1X4_10:
 	KERNEL_S1X4
 	subs	I, I, #1
 	bne	gemvt_kernel_S1X4_10
 gemvt_kernel_S1X1:
 	ands	I, M , #3
 	ble	gemvt_kernel_S1_END
 gemvt_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	I, I, #1
 	bne	gemvt_kernel_S1X1_10
 gemvt_kernel_S1_END:
 	SAVE_S1
 /*************************************************************************************************************/
 	// common exit: restore VFP registers, unwind the frame, return 0
 gemvt_kernel_L999:
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s31 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/iamax.c
+++ b/kernel/arm/iamax.c
@ -0,0 +1,75 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: NoTest
 * 	 BLASTEST double	: NoTest
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * Index of the element with the largest absolute value.
  *
  * Scans n elements of x spaced inc_x apart and returns the 1-based
  * position of the first element whose magnitude is strictly greater than
  * all earlier ones (ties keep the earliest position).
  *
  * Returns 0 when n <= 0 or inc_x < 1; in that case x is not read.
  */
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0;
 	FLOAT maxf=0.0;
 	BLASLONG max=0;
 	/* n == 0 must be rejected too: x[0] does not exist for an empty vector */
 	if (n <= 0 || inc_x < 1 ) return(max);
 	maxf=ABS(x[0]);
 	while(i < n)
 	{
 		/* maxf already holds an absolute value */
 		if( ABS(x[ix]) > maxf )
 		{
 			max = i;
 			maxf = ABS(x[ix]);
 		}
 		ix += inc_x;
 		i++;
 	}
 	return(max+1);
 }
--- a/kernel/arm/iamax_vfp.S
+++ b/kernel/arm/iamax_vfp.S
@ -0,0 +1,478 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/14 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Not referenced by the code visible in this file section. */
 #define STACKSIZE 256
 /* Incoming arguments: element count, vector pointer, element increment. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 /* INDEX: 1-based position of the best element so far (copied to r0 on exit). */
 #define INDEX	r3
 /* Z: 1-based position of the element currently being examined (r4 is pushed/popped by the routine). */
 #define Z	r4
 /* I: loop counter. */
 #define I	r12
 /* Prefetch distance in bytes for pld. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * VABS(x0,x1): x0 = |x1| when USE_ABS is defined, otherwise a nop so the
  * signed value is compared.  Used only by the real (non-COMPLEX) kernels;
  * the COMPLEX kernels call vabs directly.
  */
 #if	defined(USE_ABS)
 #if	defined(DOUBLE)
 #define	VABS(x0,x1)	vabs.f64	x0, x1
 #else
 #define	VABS(x0,x1)	vabs.f32	x0, x1
 #endif
 #else
 #define VABS(x0,x1)	nop
 #endif
 /*****************************************************************************************/
 /*
  * MOVCOND / VMOVCOND: conditional moves executed after the compare of the
  * new value against the current best.  USE_MIN selects the "lt" condition
  * (keep the smaller value), otherwise "gt" (keep the larger value).
  * NOTE(review): after vmrs, the lt condition also holds for an unordered
  * (NaN) compare while gt does not, so the two variants appear to treat NaN
  * differently -- confirm this is intended.
  */
 #if	defined(USE_MIN)
 #define	MOVCOND		movlt
 #if	defined(DOUBLE)
 #define	VMOVCOND	vmovlt.f64
 #else
 #define	VMOVCOND	vmovlt.f32
 #endif
 #else
 #define	MOVCOND		movgt
 #if	defined(DOUBLE)
 #define	VMOVCOND	vmovgt.f64
 #else
 #define	VMOVCOND	vmovgt.f32
 #endif
 #endif
 /*****************************************************************************************/
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 /*
  * Real double precision.
  * INIT_F: take the first element as the current best (d0), set Z and
  * INDEX to 1; X post-increments by one element.
  */
 .macro INIT_F
 	fldmiad	X!, { d0 }
 	VABS(   d0,  d0 )
 	mov	Z, #1
 	mov	INDEX, Z
 .endm
 /*
  * KERNEL_F1: load the next element into d4, bump Z, compare with d0 and,
  * when the VMOVCOND/MOVCOND condition holds, replace the best value and
  * record Z in INDEX.  The compare is strict, so equal values keep the
  * earlier index.
  */
 .macro KERNEL_F1
 	fldmiad	X!, { d4 }
 	add	Z, Z, #1
 	VABS(   d4,  d4 )
 	vcmpe.f64  	d4,  d0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	d0,  d4
 	MOVCOND		INDEX, Z
 .endm
 /* INIT_S: as INIT_F, but X is advanced by INC_X (a byte stride) instead of one element. */
 .macro INIT_S
 	fldmiad	X, { d0 }
 	VABS(   d0,  d0 )
 	mov	Z, #1
 	mov	INDEX, Z
 	add	X, X, INC_X
 .endm
 /* KERNEL_S1: as KERNEL_F1, but X is advanced by INC_X bytes. */
 .macro KERNEL_S1
 	fldmiad	X, { d4 }
 	add	Z, Z, #1
 	VABS(   d4,  d4 )
 	vcmpe.f64  	d4,  d0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	d0,  d4
 	MOVCOND		INDEX, Z
 	add	X, X, INC_X
 .endm
 #else
 /*
  * Real single precision.
  * INIT_F: take the first element as the current best (s0), set Z and
  * INDEX to 1; X post-increments by one element.
  */
 .macro INIT_F
 	fldmias	X!, { s0 }
 	VABS(   s0,  s0 )
 	mov	Z, #1
 	mov	INDEX, Z
 .endm
 /*
  * KERNEL_F1: load the next element into s4, bump Z, compare with s0 and,
  * when the VMOVCOND/MOVCOND condition holds, replace the best value and
  * record Z in INDEX.  The compare is strict, so equal values keep the
  * earlier index.
  */
 .macro KERNEL_F1
 	fldmias	X!, { s4 }
 	add	Z, Z, #1
 	VABS(   s4,  s4 )
 	vcmpe.f32  	s4,  s0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	s0,  s4
 	MOVCOND		INDEX, Z
 .endm
 /* INIT_S: as INIT_F, but X is advanced by INC_X (a byte stride) instead of one element. */
 .macro INIT_S
 	fldmias	X, { s0 }
 	VABS(   s0,  s0 )
 	mov	Z, #1
 	mov	INDEX, Z
 	add	X, X, INC_X
 .endm
 /* KERNEL_S1: as KERNEL_F1, but X is advanced by INC_X bytes. */
 .macro KERNEL_S1
 	fldmias	X, { s4 }
 	add	Z, Z, #1
 	VABS(   s4,  s4 )
 	vcmpe.f32  	s4,  s0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	s0,  s4
 	MOVCOND		INDEX, Z
 	add	X, X, INC_X
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 /*
  * Complex double precision.  The value compared for each element is the
  * sum of the absolute values of its two consecutive doubles; vabs is
  * applied unconditionally here (USE_ABS is not consulted).
  * INIT_F: first element becomes the current best (d0); Z = INDEX = 1.
  */
 .macro INIT_F
 	fldmiad	X!, { d0 -d1 }
 	vabs.f64   d0,  d0
 	vabs.f64   d1,  d1
 	vadd.f64   d0  , d0,  d1
 	mov	Z, #1
 	mov	INDEX, Z
 .endm
 /* KERNEL_F1: next element -> d4 (abs sum), bump Z, conditionally update d0 and INDEX. */
 .macro KERNEL_F1
 	fldmiad	X!, { d4 - d5 }
 	add	Z, Z, #1
 	vabs.f64   d4,  d4
 	vabs.f64   d5,  d5
 	vadd.f64   d4  , d4,  d5
 	vcmpe.f64  	d4,  d0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	d0,  d4
 	MOVCOND		INDEX, Z
 .endm
 /* INIT_S: as INIT_F, but X is advanced by INC_X (a byte stride). */
 .macro INIT_S
 	fldmiad	X, { d0 -d1 }
 	vabs.f64   d0,  d0
 	vabs.f64   d1,  d1
 	vadd.f64   d0  , d0,  d1
 	mov	Z, #1
 	mov	INDEX, Z
 	add	X, X, INC_X
 .endm
 /* KERNEL_S1: as KERNEL_F1, but X is advanced by INC_X bytes. */
 .macro KERNEL_S1
 	fldmiad	X, { d4 - d5 }
 	add	Z, Z, #1
 	vabs.f64   d4,  d4
 	vabs.f64   d5,  d5
 	vadd.f64   d4  , d4,  d5
 	vcmpe.f64  	d4,  d0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	d0,  d4
 	MOVCOND		INDEX, Z
 	add	X, X, INC_X
 .endm
 #else
 /*
  * Complex single precision.  The value compared for each element is the
  * sum of the absolute values of its two consecutive floats; vabs is
  * applied unconditionally here (USE_ABS is not consulted).
  * INIT_F: first element becomes the current best (s0); Z = INDEX = 1.
  */
 .macro INIT_F
 	fldmias	X!, { s0 -s1 }
 	vabs.f32   s0,  s0
 	vabs.f32   s1,  s1
 	vadd.f32   s0  , s0,  s1
 	mov	Z, #1
 	mov	INDEX, Z
 .endm
 /* KERNEL_F1: next element -> s4 (abs sum), bump Z, conditionally update s0 and INDEX. */
 .macro KERNEL_F1
 	fldmias	X!, { s4 - s5 }
 	add	Z, Z, #1
 	vabs.f32   s4,  s4
 	vabs.f32   s5,  s5
 	vadd.f32   s4  , s4,  s5
 	vcmpe.f32  	s4,  s0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	s0,  s4
 	MOVCOND		INDEX, Z
 .endm
 /* INIT_S: as INIT_F, but X is advanced by INC_X (a byte stride). */
 .macro INIT_S
 	fldmias	X, { s0 -s1 }
 	vabs.f32   s0,  s0
 	vabs.f32   s1,  s1
 	vadd.f32   s0  , s0,  s1
 	mov	Z, #1
 	mov	INDEX, Z
 	add	X, X, INC_X
 .endm
 /* KERNEL_S1: as KERNEL_F1, but X is advanced by INC_X bytes. */
 .macro KERNEL_S1
 	fldmias	X, { s4 - s5 }
 	add	Z, Z, #1
 	vabs.f32   s4,  s4
 	vabs.f32   s5,  s5
 	vadd.f32   s4  , s4,  s5
 	vcmpe.f32  	s4,  s0
 	vmrs		APSR_nzcv, fpscr
 	VMOVCOND	s0,  s4
 	MOVCOND		INDEX, Z
 	add	X, X, INC_X
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Routine body.  Returns in r0 the 1-based position selected by the
  * KERNEL_* macros, or 0 when N <= 0 or INC_X == 0.
  *
  * The first element is consumed by INIT_F / INIT_S; the remaining N - 1
  * elements are processed in groups of four plus an (N - 1) & 3 tail.
  * INC_X == 1 takes the contiguous F path, any other non-zero increment the
  * strided S path.
  */
 	PROLOGUE
 	.align 5
 	push    {r4}
 	// d0/s0 = d0 - d0
 #if defined(DOUBLE)
 	vsub.f64                d0 , d0 , d0
 #else
 	vsub.f32                s0 , s0 , s0
 #endif
 	mov	INDEX, #0
 	cmp	N, #0
 	ble	iamax_kernel_L999
 	cmp	INC_X, #0
 	beq	iamax_kernel_L999
 	cmp	INC_X, #1
 	bne	iamax_kernel_S_BEGIN
 	// ---- contiguous path ----
 iamax_kernel_F_BEGIN:
 	INIT_F
 	// N now counts the elements still to be examined
 	subs	N, N , #1
 	ble	iamax_kernel_L999
 	asrs	I, N, #2					// I = N / 4
 	ble	iamax_kernel_F1
 	.align 5
 	// loop body holds two groups of four elements, with an exit test after each group
 iamax_kernel_F4:
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 #if defined(COMPLEX) && defined(DOUBLE)
 	pld	[ X, #X_PRE ]
 #endif
 	KERNEL_F1
 	KERNEL_F1
 	subs	I, I, #1
 	ble	iamax_kernel_F1
 #if defined(COMPLEX) || defined(DOUBLE)
 	pld	[ X, #X_PRE ]
 #endif
 	KERNEL_F1
 	KERNEL_F1
 #if defined(COMPLEX) && defined(DOUBLE)
 	pld	[ X, #X_PRE ]
 #endif
 	KERNEL_F1
 	KERNEL_F1
 	subs	I, I, #1
 	bne	iamax_kernel_F4
 iamax_kernel_F1:
 	// leftover elements (count mod 4)
 	ands	I, N, #3
 	ble	iamax_kernel_L999
 iamax_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     iamax_kernel_F10
 	b	iamax_kernel_L999
 	// ---- strided path: convert the element increment into a byte stride ----
 iamax_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 #endif
 #endif
 	INIT_S
 	subs	N, N , #1
 	ble	iamax_kernel_L999
 	asrs	I, N, #2					// I = N / 4
 	ble	iamax_kernel_S1
 	.align 5
 iamax_kernel_S4:
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	subs	I, I, #1
 	bne	iamax_kernel_S4
 iamax_kernel_S1:
 	ands	I, N, #3
 	ble	iamax_kernel_L999
 iamax_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     iamax_kernel_S10
 	// common exit
 iamax_kernel_L999:
 	mov	r0, INDEX		// set return value
 	pop     {r4}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/iamin.c
+++ b/kernel/arm/iamin.c
@ -0,0 +1,75 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: NoTest
 * 	 BLASTEST double	: NoTest
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * Index of the element with the smallest absolute value.
  *
  * n      number of elements to examine
  * x      input vector
  * inc_x  distance, in elements, between consecutive entries of x
  *
  * Returns the 1-based position of the first element whose absolute value
  * is minimal, or 0 when n <= 0 or inc_x < 1.
  */
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	BLASLONG min = 0;
 	FLOAT minf;
 	FLOAT absx;

 	/* n == 0 has to be rejected as well: x[0] is read just below. */
 	if (n <= 0 || inc_x < 1) return(0);

 	minf = ABS(x[0]);

 	/* Element 0 is already the candidate, so the scan starts at element 1. */
 	ix = inc_x;
 	for (i = 1; i < n; i++)
 	{
 		absx = ABS(x[ix]);
 		/* Strict comparison keeps the first occurrence on ties. */
 		if (absx < minf)
 		{
 			min = i;
 			minf = absx;
 		}
 		ix += inc_x;
 	}
 	return(min + 1);
 }
--- a/kernel/arm/imax.c
+++ b/kernel/arm/imax.c
@ -0,0 +1,67 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: NoTest
 * 	 BLASTEST double	: NoTest
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 /*
  * Index of the largest element (signed comparison, no absolute value).
  *
  * n      number of elements to examine
  * x      input vector
  * inc_x  distance, in elements, between consecutive entries of x
  *
  * Returns the 1-based position of the first maximal element, or 0 when
  * n <= 0 or inc_x < 1.
  */
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	BLASLONG max = 0;
 	FLOAT maxf;

 	/* n == 0 has to be rejected as well: x[0] is read just below. */
 	if (n <= 0 || inc_x < 1) return(0);

 	maxf = x[0];

 	/* Element 0 is already the candidate, so the scan starts at element 1. */
 	ix = inc_x;
 	for (i = 1; i < n; i++)
 	{
 		/* Strict comparison keeps the first occurrence on ties. */
 		if (x[ix] > maxf)
 		{
 			max = i;
 			maxf = x[ix];
 		}
 		ix += inc_x;
 	}
 	return(max + 1);
 }
--- a/kernel/arm/imin.c
+++ b/kernel/arm/imin.c
@ -0,0 +1,65 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/08/19 Saar
 *	 BLASTEST float		
 * 	 BLASTEST double	
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 /*
  * Index of the smallest element (signed comparison, no absolute value).
  *
  * n      number of elements to examine
  * x      input vector
  * inc_x  distance, in elements, between consecutive entries of x
  *
  * Returns the 1-based position of the first minimal element, or 0 when
  * n <= 0 or inc_x < 1.
  */
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	BLASLONG min = 0;
 	FLOAT minf;

 	/* n == 0 has to be rejected as well: x[0] is read just below. */
 	if (n <= 0 || inc_x < 1) return(0);

 	minf = x[0];

 	/* Element 0 is already the candidate, so the scan starts at element 1. */
 	ix = inc_x;
 	for (i = 1; i < n; i++)
 	{
 		/*
 		 * Must be "<": a ">" test here would select the maximum.
 		 * Strict comparison keeps the first occurrence on ties.
 		 */
 		if (x[ix] < minf)
 		{
 			min = i;
 			minf = x[ix];
 		}
 		ix += inc_x;
 	}
 	return(min + 1);
 }
--- a/kernel/arm/izamax.c
+++ b/kernel/arm/izamax.c
@ -0,0 +1,81 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: NoTest
 * 	 BLASTEST double	: NoTest
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 #define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
 /*
  * Index of the complex element with the largest |re| + |im|.
  *
  * n      number of complex elements to examine
  * x      input vector, stored as interleaved (re, im) pairs
  * inc_x  distance, in complex elements, between consecutive entries of x
  *
  * Returns the 1-based position of the first maximal element, or 0 when
  * n <= 0 or inc_x < 1.
  */
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	BLASLONG inc_x2;
 	BLASLONG max = 0;
 	FLOAT maxf;
 	FLOAT cur;

 	/* n == 0 has to be rejected as well: x[0] and x[1] are read below. */
 	if (n <= 0 || inc_x < 1) return(0);

 	/* Each complex element occupies two FLOAT slots. */
 	inc_x2 = 2 * inc_x;

 	maxf = CABS1(x,0);

 	/* Element 0 is already the candidate, so the scan starts at element 1. */
 	ix = inc_x2;
 	for (i = 1; i < n; i++)
 	{
 		cur = CABS1(x,ix);
 		/* Strict comparison keeps the first occurrence on ties. */
 		if (cur > maxf)
 		{
 			max = i;
 			maxf = cur;
 		}
 		ix += inc_x2;
 	}
 	return(max + 1);
 }
--- a/kernel/arm/izamin.c
+++ b/kernel/arm/izamin.c
@ -0,0 +1,81 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: NoTest
 * 	 BLASTEST double	: NoTest
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 #define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
 /*
  * Index of the complex element with the smallest |re| + |im|.
  *
  * n      number of complex elements to examine
  * x      input vector, stored as interleaved (re, im) pairs
  * inc_x  distance, in complex elements, between consecutive entries of x
  *
  * Returns the 1-based position of the first minimal element, or 0 when
  * n <= 0 or inc_x < 1.
  */
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	BLASLONG inc_x2;
 	BLASLONG min = 0;
 	FLOAT minf;
 	FLOAT cur;

 	/* n == 0 has to be rejected as well: x[0] and x[1] are read below. */
 	if (n <= 0 || inc_x < 1) return(0);

 	/* Each complex element occupies two FLOAT slots. */
 	inc_x2 = 2 * inc_x;

 	minf = CABS1(x,0);

 	/* Element 0 is already the candidate, so the scan starts at element 1. */
 	ix = inc_x2;
 	for (i = 1; i < n; i++)
 	{
 		cur = CABS1(x,ix);
 		/* Strict comparison keeps the first occurrence on ties. */
 		if (cur < minf)
 		{
 			min = i;
 			minf = cur;
 		}
 		ix += inc_x2;
 	}
 	return(min + 1);
 }
--- a/kernel/arm/max.c
+++ b/kernel/arm/max.c
@ -0,0 +1,63 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: NoTest
 * 	 BLASTEST double	: NoTest
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 /*
  * Largest element of a strided vector (signed comparison).
  *
  * n      number of elements to examine
  * x      input vector
  * inc_x  distance, in elements, between consecutive entries of x
  *
  * Returns the maximum value, or 0.0 when n <= 0 or inc_x < 1.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	FLOAT maxf = 0.0;

 	/* n == 0 has to be rejected as well: x[0] is read just below. */
 	if (n <= 0 || inc_x < 1) return(maxf);

 	maxf = x[0];

 	/* Element 0 is already the candidate, so the scan starts at element 1. */
 	ix = inc_x;
 	for (i = 1; i < n; i++)
 	{
 		if (x[ix] > maxf)
 		{
 			maxf = x[ix];
 		}
 		ix += inc_x;
 	}
 	return(maxf);
 }
--- a/kernel/arm/min.c
+++ b/kernel/arm/min.c
@ -0,0 +1,63 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: NoTest
 * 	 BLASTEST double	: NoTest
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 /*
  * Smallest element of a strided vector (signed comparison).
  *
  * n      number of elements to examine
  * x      input vector
  * inc_x  distance, in elements, between consecutive entries of x
  *
  * Returns the minimum value, or 0.0 when n <= 0 or inc_x < 1.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	FLOAT minf = 0.0;

 	/* n == 0 has to be rejected as well: x[0] is read just below. */
 	if (n <= 0 || inc_x < 1) return(minf);

 	minf = x[0];

 	/* Element 0 is already the candidate, so the scan starts at element 1. */
 	ix = inc_x;
 	for (i = 1; i < n; i++)
 	{
 		if (x[ix] < minf)
 		{
 			minf = x[ix];
 		}
 		ix += inc_x;
 	}
 	return(minf);
 }
--- a/kernel/arm/nrm2.c
+++ b/kernel/arm/nrm2.c
@ -0,0 +1,88 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/13 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * Euclidean norm of a strided real vector, computed as scale * sqrt(ssq)
  * where scale tracks the largest magnitude seen so far and ssq the sum of
  * squares of the elements divided by scale.
  *
  * n      number of elements
  * x      input vector
  * inc_x  distance, in elements, between consecutive entries of x
  *
  * Returns 0.0 when n < 0 or inc_x < 1, and |x[0]| when n == 1.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	FLOAT scale = 0.0;
 	FLOAT ssq   = 1.0;
 	FLOAT mag   = 0.0;
 	BLASLONG pos;
 	BLASLONG limit;

 	if (n < 0 || inc_x < 1 ) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );

 	/* One past the last slot index that will be visited. */
 	limit = n * inc_x;

 	for (pos = 0; pos < limit; pos += inc_x)
 	{
 		/* Exact zeros contribute nothing and are skipped. */
 		if ( x[pos] == 0.0 ) continue;

 		mag = ABS( x[pos] );
 		if ( scale < mag )
 		{
 			/* New largest magnitude: rescale the running sum to it. */
 			ssq = 1 + ssq * ( scale / mag ) * ( scale / mag );
 			scale = mag ;
 		}
 		else
 		{
 			ssq += ( mag/scale ) * ( mag/scale );
 		}
 	}

 	scale = scale * sqrt( ssq );
 	return(scale);
 }
--- a/kernel/arm/nrm2_vfp.S
+++ b/kernel/arm/nrm2_vfp.S
@ -0,0 +1,565 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/22 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 #define STACKSIZE 256
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define I	r12
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 // KERNEL_F1 (real, double precision, unit stride).
 // Loads one double from X with write-back (X advances past it) and folds it
 // into the running scaled sum of squares.
 // Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d4 = element,
 // d2 / d3 = scratch. Elements that compare equal to 0.0 are skipped.
 // The \@ suffix gives each expansion its own local label.
 .macro KERNEL_F1
 	fldmiad	X!, 	{ d4 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_F1_NEXT_\@:
 .endm
 // KERNEL_F8 (real, double precision, unit stride).
 // Eight back-to-back KERNEL_F1 expansions, with a prefetch hint for the
 // address X + X_PRE issued before each group of four.
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 // KERNEL_S1 (real, double precision, arbitrary stride).
 // Loads one double from X without write-back, folds it into the running
 // scaled sum of squares, then advances X by INC_X (a byte offset here).
 // Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d4 = element,
 // d2 / d3 = scratch. Elements that compare equal to 0.0 are skipped.
 // The local label carries the \@ suffix, like the other macros in this
 // file, so the macro can be expanded more than once without defining the
 // same symbol twice.
 .macro KERNEL_S1
 	fldmiad	X, 	{ d4 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_S1_NEXT_\@:
 	add	X, X, INC_X
 .endm
 #else
 // KERNEL_F1 (real, single precision, unit stride).
 // Loads one float from X with write-back (X advances past it) and folds it
 // into the running scaled sum of squares.
 // Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s4 = element,
 // s2 / s3 = scratch. Elements that compare equal to 0.0 are skipped.
 // The \@ suffix gives each expansion its own local label.
 .macro KERNEL_F1
 	fldmias	X!, 	{ s4 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_F1_NEXT_\@:
 .endm
 // KERNEL_F8 (real, single precision, unit stride).
 // One prefetch hint for the address X + X_PRE followed by eight
 // back-to-back KERNEL_F1 expansions.
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 // KERNEL_S1 (real, single precision, arbitrary stride).
 // Loads one float from X without write-back, folds it into the running
 // scaled sum of squares, then advances X by INC_X (a byte offset here).
 // Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s4 = element,
 // s2 / s3 = scratch. Elements that compare equal to 0.0 are skipped.
 // The local label carries the \@ suffix, like the other macros in this
 // file, so the macro can be expanded more than once without defining the
 // same symbol twice.
 .macro KERNEL_S1
 	fldmias	X, 	{ s4 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_S1_NEXT_\@:
 	add	X, X, INC_X
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 // KERNEL_F1 (complex, double precision, unit stride).
 // Loads two consecutive doubles (d4, d5) from X with write-back and folds
 // each of them, one after the other, into the running scaled sum of
 // squares. The two halves are identical except for the source register.
 // Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
 // A value that compares equal to 0.0 is skipped.
 // The \@ suffix gives each expansion its own local labels.
 .macro KERNEL_F1
 	fldmiad	X!, 	{ d4 - d5 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_F1_NEXT_\@:
 	vcmpe.f64	d5, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_END_\@	
 	vabs.f64   	d5,  d5 
 	vcmpe.f64  	d0,  d5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_END_\@
 	vdiv.f64	d2 , d0, d5			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d5				// scale = x
 KERNEL_F1_END_\@:
 .endm
 // KERNEL_F8 (complex, double precision, unit stride).
 // Eight KERNEL_F1 expansions, with a prefetch hint for the address
 // X + X_PRE issued before each pair.
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 .endm
 // KERNEL_S1 (complex, double precision, arbitrary stride).
 // Loads two consecutive doubles (d4, d5) from X without write-back, folds
 // each of them into the running scaled sum of squares, then advances X by
 // INC_X (a byte offset here).
 // Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
 // A value that compares equal to 0.0 is skipped.
 // The \@ suffix gives each expansion its own local labels.
 .macro KERNEL_S1
 	fldmiad	X, 	{ d4 - d5 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_S1_NEXT_\@:
 	vcmpe.f64	d5, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_END_\@	
 	vabs.f64   	d5,  d5 
 	vcmpe.f64  	d0,  d5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_END_\@
 	vdiv.f64	d2 , d0, d5			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d5				// scale = x
 KERNEL_S1_END_\@:
 	add	X, X, INC_X
 .endm
 #else
 // KERNEL_F1 (complex, single precision, unit stride).
 // Loads two consecutive floats (s4, s5) from X with write-back and folds
 // each of them, one after the other, into the running scaled sum of
 // squares. The two halves are identical except for the source register.
 // Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
 // A value that compares equal to 0.0 is skipped.
 // The \@ suffix gives each expansion its own local labels.
 .macro KERNEL_F1
 	fldmias	X!, 	{ s4 - s5 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_F1_NEXT_\@:
 	vcmpe.f32	s5, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_END_\@	
 	vabs.f32   	s5,  s5 
 	vcmpe.f32  	s0,  s5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_END_\@
 	vdiv.f32	s2 , s0, s5			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s5				// scale = x
 KERNEL_F1_END_\@:
 .endm
 // KERNEL_F8 (complex, single precision, unit stride).
 // Eight KERNEL_F1 expansions, with a prefetch hint for the address
 // X + X_PRE issued before each group of four.
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 // KERNEL_S1 (complex, single precision, arbitrary stride).
 // Loads two consecutive floats (s4, s5) from X without write-back, folds
 // each of them into the running scaled sum of squares, then advances X by
 // INC_X (a byte offset here).
 // Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
 // A value that compares equal to 0.0 is skipped.
 // The \@ suffix gives each expansion its own local labels.
 .macro KERNEL_S1
 	fldmias	X, 	{ s4 - s5 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_S1_NEXT_\@:
 	vcmpe.f32	s5, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_END_\@	
 	vabs.f32   	s5,  s5 
 	vcmpe.f32  	s0,  s5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_END_\@
 	vdiv.f32	s2 , s0, s5			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s5				// scale = x
 KERNEL_S1_END_\@:
 	add	X, X, INC_X
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 // Entry point of the nrm2 kernel: result = scale * sqrt( ssq ), where the
 // KERNEL_* macros above maintain scale (d0 / s0) and ssq (d1 / s1).
 // Inputs: N = r0 (element count), X = r1 (vector), INC_X = r2 (stride in
 // elements). The result is left in d0 (DOUBLE) or s0 before returning.
 // When N <= 0 or INC_X == 0 the loops are skipped and the exit code
 // computes 0.0 * sqrt( 1.0 ).
 	PROLOGUE
 	// Jump over the constant embedded in the instruction stream below.
 	b nrm2_begin
 	// The constant 1.0, loaded PC-relative by the vldr instructions below.
 	// Double: 0x3ff00000_00000000 stored as two words, low word first.
 	// Single: 0x3f800000.
 #if	defined(COMPLEX)
 #if	defined(DOUBLE)
 znrm2_one:
 	.word	0x00000000
 	.word	0x3ff00000
 #else
 cnrm2_one:
 	.word	0x3f800000
 #endif
 #else
 #if	defined(DOUBLE)
 dnrm2_one:
 	.word	0x00000000
 	.word	0x3ff00000
 #else
 snrm2_one:
 	.word	0x3f800000
 #endif
 #endif
 	.align 5
 nrm2_begin:
 	// Set up the accumulators and the two constants the macros compare
 	// against: register 6 holds 0.0 and register 7 holds 1.0.
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	vsub.f64                d0 , d0 , d0		// scale=0.0
 	vldr.64			d1 , znrm2_one		// ssq=1.0
 	vmov.f64		d7 , d1			// value 1.0 
 	vmov.f64		d6 , d0			// value 0.0 
 #else
 	vsub.f32                s0 , s0 , s0		// scale=0.0
 	vldr.32			s1 , cnrm2_one		// ssq=1.0
 	vmov.f32		s7 , s1			// value 1.0
 	vmov.f32		s6 , s0			// value 0.0 
 #endif
 #else
 #if defined(DOUBLE)
 	vsub.f64                d0 , d0 , d0		// scale=0.0
 	vldr.64			d1 , dnrm2_one		// ssq=1.0
 	vmov.f64		d7 , d1			// value 1.0 
 	vmov.f64		d6 , d0			// value 0.0 
 #else
 	vsub.f32                s0 , s0 , s0		// scale=0.0
 	vldr.32			s1 , snrm2_one		// ssq=1.0
 	vmov.f32		s7 , s1			// value 1.0
 	vmov.f32		s6 , s0			// value 0.0 
 #endif
 #endif
 	// Early exits: nothing to accumulate.
 	cmp	N, #0
 	ble	nrm2_kernel_L999
 	cmp	INC_X, #0
 	beq	nrm2_kernel_L999
 	// Unit stride takes the F path, every other stride the S path.
 	cmp	INC_X, #1
 	bne	nrm2_kernel_S_BEGIN
 nrm2_kernel_F_BEGIN:
 	asrs	I, N, #3				// I = N / 8
 	ble	nrm2_kernel_F1
 	// Unit-stride main loop: eight elements per iteration.
 nrm2_kernel_F8:
 	KERNEL_F8
 	subs    I, I, #1
        bne     nrm2_kernel_F8
 	// Unit-stride tail: the remaining N mod 8 elements, one at a time.
 nrm2_kernel_F1:
 	ands    I, N, #7
        ble     nrm2_kernel_L999
 nrm2_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     nrm2_kernel_F10
 	b	nrm2_kernel_L999
 	// Strided path: first turn INC_X from an element count into a byte
 	// offset (shift amount depends on element size and COMPLEX).
 nrm2_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 #endif
 #endif
 	// No unrolling on this path: KERNEL_S1 is expanded once and run N times.
 nrm2_kernel_S1:
 	mov	I, N
 	.align 5
 nrm2_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     nrm2_kernel_S10
 	// Common exit: result = scale * sqrt( ssq ).
 nrm2_kernel_L999:
 #if defined(DOUBLE)
 	vsqrt.f64	d1, d1
 	vmul.f64	d0, d0, d1
 #else
 	vsqrt.f32	s1, s1
 	vmul.f32	s0, s0, s1
 #endif
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/nrm2_vfpv3.S
+++ b/kernel/arm/nrm2_vfpv3.S
@ -0,0 +1,508 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/16 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* STACKSIZE is defined here but not referenced by the code in this file. */
 #define STACKSIZE 256
 /* Element count; the routine returns early when it is <= 0. */
 #define	N	r0
 /* Address of the current element; advanced by every kernel macro. */
 #define	X	r1
 /* Stride in elements on entry; scaled to bytes before the strided loop. */
 #define	INC_X	r2
 /* Loop counter for the unrolled, remainder and strided loops. */
 #define I	r12
 /* Byte offset ahead of X used by the pld prefetch hints. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 /*
  * KERNEL_F1 (real, double precision, unit stride).
  * Folds one element into the running scaled sum of squares.
  *   d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0 (set up by the routine body)
  *   d2, d3, d4 are scratch.
  * The load post-increments X by one element. An element equal to 0.0 is
  * skipped. The \@ suffix makes the skip label unique per expansion.
  */
 .macro KERNEL_F1
 	fldmiad	X!, 	{ d4 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	// The two "ge" instructions and the bge below all use the flags copied above.
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	// Otherwise |x| becomes the new scale and ssq is rescaled to it.
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_F1_NEXT_\@:
 .endm
 /*
  * KERNEL_F8 (real, double precision, unit stride).
  * Eight back-to-back KERNEL_F1 expansions, with a prefetch hint X_PRE bytes
  * ahead of X before each group of four.
  */
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 /*
  * KERNEL_S1 (real, double precision, strided).
  * Folds one element into the running scaled sum of squares, then advances
  * X by INC_X (already scaled to bytes by the routine body).
  *   d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0; d2, d3, d4 are scratch.
  * An element equal to 0.0 is skipped. The skip label carries the \@
  * expansion counter, as in KERNEL_F1, so expanding the macro more than once
  * cannot produce a duplicate symbol.
  */
 .macro KERNEL_S1
 	fldmiad	X, 	{ d4 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@
 	vabs.f64   	d4,  d4
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	// The two "ge" instructions and the bge below all use the flags copied above.
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	// Otherwise |x| becomes the new scale and ssq is rescaled to it.
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_S1_NEXT_\@:
 	add	X, X, INC_X
 .endm
 #else
 /*
  * KERNEL_F1 (real, single precision, unit stride).
  * Folds one element into the running scaled sum of squares.
  *   s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0 (set up by the routine body)
  *   s2, s3, s4 are scratch.
  * The load post-increments X by one element. An element equal to 0.0 is
  * skipped. The \@ suffix makes the skip label unique per expansion.
  */
 .macro KERNEL_F1
 	fldmias	X!, 	{ s4 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	// The two "ge" instructions and the bge below all use the flags copied above.
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	// Otherwise |x| becomes the new scale and ssq is rescaled to it.
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_F1_NEXT_\@:
 .endm
 /*
  * KERNEL_F8 (real, single precision, unit stride).
  * One prefetch hint X_PRE bytes ahead of X, then eight back-to-back
  * KERNEL_F1 expansions.
  */
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 /*
  * KERNEL_S1 (real, single precision, strided).
  * Folds one element into the running scaled sum of squares, then advances
  * X by INC_X (already scaled to bytes by the routine body).
  *   s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0; s2, s3, s4 are scratch.
  * An element equal to 0.0 is skipped. The skip label carries the \@
  * expansion counter, as in KERNEL_F1, so expanding the macro more than once
  * cannot produce a duplicate symbol.
  */
 .macro KERNEL_S1
 	fldmias	X, 	{ s4 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@
 	vabs.f32   	s4,  s4
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	// The two "ge" instructions and the bge below all use the flags copied above.
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	// Otherwise |x| becomes the new scale and ssq is rescaled to it.
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_S1_NEXT_\@:
 	add	X, X, INC_X
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 /*
  * KERNEL_F1 (complex, double precision, unit stride).
  * Loads the two components of one complex element into d4 and d5
  * (post-incrementing X past both) and folds each of them, in turn, into the
  * running scaled sum of squares.
  *   d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0; d2, d3 are scratch.
  * A component equal to 0.0 is skipped. Labels are unique per expansion (\@).
  */
 .macro KERNEL_F1
 	fldmiad	X!, 	{ d4 - d5 }
 	// ---- first component (d4) ----
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_F1_NEXT_\@:
 	// ---- second component (d5) ----
 	vcmpe.f64	d5, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_END_\@	
 	vabs.f64   	d5,  d5 
 	vcmpe.f64  	d0,  d5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_END_\@
 	vdiv.f64	d2 , d0, d5			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d5				// scale = x
 KERNEL_F1_END_\@:
 .endm
 /*
  * KERNEL_F8 (complex, double precision, unit stride).
  * Eight KERNEL_F1 expansions (one complex element each), with a prefetch
  * hint X_PRE bytes ahead of X before every pair.
  */
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 .endm
 /*
  * KERNEL_S1 (complex, double precision, strided).
  * Loads the two components of one complex element into d4 and d5 without
  * moving X, folds each into the running scaled sum of squares, then adds
  * INC_X (already scaled to bytes by the routine body) to X.
  *   d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0; d2, d3 are scratch.
  * A component equal to 0.0 is skipped. Labels are unique per expansion (\@).
  */
 .macro KERNEL_S1
 	fldmiad	X, 	{ d4 - d5 }
 	// ---- first component (d4) ----
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_S1_NEXT_\@:
 	// ---- second component (d5) ----
 	vcmpe.f64	d5, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_END_\@	
 	vabs.f64   	d5,  d5 
 	vcmpe.f64  	d0,  d5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_END_\@
 	vdiv.f64	d2 , d0, d5			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d5				// scale = x
 KERNEL_S1_END_\@:
 	add	X, X, INC_X
 .endm
 #else
 /*
  * KERNEL_F1 (complex, single precision, unit stride).
  * Loads the two components of one complex element into s4 and s5
  * (post-incrementing X past both) and folds each of them, in turn, into the
  * running scaled sum of squares.
  *   s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0; s2, s3 are scratch.
  * A component equal to 0.0 is skipped. Labels are unique per expansion (\@).
  */
 .macro KERNEL_F1
 	fldmias	X!, 	{ s4 - s5 }
 	// ---- first component (s4) ----
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_F1_NEXT_\@:
 	// ---- second component (s5) ----
 	vcmpe.f32	s5, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_END_\@	
 	vabs.f32   	s5,  s5 
 	vcmpe.f32  	s0,  s5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_END_\@
 	vdiv.f32	s2 , s0, s5			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s5				// scale = x
 KERNEL_F1_END_\@:
 .endm
 /*
  * KERNEL_F8 (complex, single precision, unit stride).
  * Eight KERNEL_F1 expansions (one complex element each), with a prefetch
  * hint X_PRE bytes ahead of X before each group of four.
  */
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 /*
  * KERNEL_S1 (complex, single precision, strided).
  * Loads the two components of one complex element into s4 and s5 without
  * moving X, folds each into the running scaled sum of squares, then adds
  * INC_X (already scaled to bytes by the routine body) to X.
  *   s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0; s2, s3 are scratch.
  * A component equal to 0.0 is skipped. Labels are unique per expansion (\@).
  */
 .macro KERNEL_S1
 	fldmias	X, 	{ s4 - s5 }
 	// ---- first component (s4) ----
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_S1_NEXT_\@:
 	// ---- second component (s5) ----
 	vcmpe.f32	s5, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_END_\@	
 	vabs.f32   	s5,  s5 
 	vcmpe.f32  	s0,  s5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_END_\@
 	vdiv.f32	s2 , s0, s5			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s5				// scale = x
 KERNEL_S1_END_\@:
 	add	X, X, INC_X
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Routine body: scaled sum-of-squares norm over N elements starting at X.
  *
  * Inputs (register aliases defined at the top of the file):
  *   N     - element count
  *   X     - address of the first element
  *   INC_X - stride in elements
  * Result: scale * sqrt(ssq), left in d0 (DOUBLE) or s0 (otherwise).
  *
  * When N <= 0 or INC_X == 0 no element is read and the result is
  * 0.0 * sqrt(1.0). INC_X == 1 takes the unrolled contiguous path; any other
  * stride takes the one-element-per-iteration strided path.
  */
 	PROLOGUE
 	.align 5
 	// Initial state: scale = 0.0, ssq = 1.0, plus constant copies of 1.0 and 0.0
 	// that the kernel macros read from d7/d6 (s7/s6).
 #if defined(DOUBLE)
 	vsub.f64                d0 , d0 , d0		// scale=0.0
 	vmov.f64		d1 , #1.0		// ssq=1.0
 	vmov.f64		d7 , d1			// value 1.0 
 	vmov.f64		d6 , d0			// value 0.0 
 #else
 	vsub.f32                s0 , s0 , s0		// scale=0.0
 	vmov.f32		s1 , #1.0		// ssq=1.0
 	vmov.f32		s7 , s1			// value 1.0
 	vmov.f32		s6 , s0			// value 0.0 
 #endif
 	// Nothing to accumulate: go straight to the final scale * sqrt(ssq).
 	cmp	N, #0
 	ble	nrm2_kernel_L999
 	cmp	INC_X, #0
 	beq	nrm2_kernel_L999
 	cmp	INC_X, #1
 	bne	nrm2_kernel_S_BEGIN
 	// ---- contiguous path: N / 8 unrolled passes, then N % 8 single elements ----
 nrm2_kernel_F_BEGIN:
 	asrs	I, N, #3				// I = N / 8
 	ble	nrm2_kernel_F1
 nrm2_kernel_F8:
 	KERNEL_F8
 	subs    I, I, #1
        bne     nrm2_kernel_F8
 nrm2_kernel_F1:
 	ands    I, N, #7
        ble     nrm2_kernel_L999
 nrm2_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     nrm2_kernel_F10
 	b	nrm2_kernel_L999
 	// ---- strided path: convert INC_X from elements to bytes, then loop N times ----
 nrm2_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 #endif
 #endif
 nrm2_kernel_S1:
 	mov	I, N
 	.align 5
 nrm2_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     nrm2_kernel_S10
 	// ---- result = scale * sqrt(ssq) ----
 nrm2_kernel_L999:
 #if defined(DOUBLE)
 	vsqrt.f64	d1, d1
 	vmul.f64	d0, d0, d1
 #else
 	vsqrt.f32	s1, s1
 	vmul.f32	s0, s0, s1
 #endif
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/rot.c
+++ b/kernel/arm/rot.c
@ -0,0 +1,62 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Apply a plane rotation to n pairs of elements taken from x and y:
  *
  *     x_new = c * x + s * y
  *     y_new = c * y - s * x
  *
  * x is walked with stride inc_x and y with stride inc_y, both starting at
  * index 0. Nothing is touched when n <= 0. Always returns 0.
  */
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
 	BLASLONG done;
 	BLASLONG xpos = 0;
 	BLASLONG ypos = 0;
 	FLOAT rotated_x;

 	if (n <= 0)
 		return 0;

 	for (done = 0; done < n; done++) {
 		/* Both results are computed from the old values; y is stored first. */
 		rotated_x = c*x[xpos] + s*y[ypos];
 		y[ypos] = c*y[ypos] - s*x[xpos];
 		x[xpos] = rotated_x;

 		xpos += inc_x;
 		ypos += inc_y;
 	}

 	return 0;
 }
--- a/kernel/arm/rot_vfp.S
+++ b/kernel/arm/rot_vfp.S
@ -0,0 +1,584 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/15 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* STACKSIZE is defined here but not referenced by the code in this file. */
 #define STACKSIZE 256
 /* Stack slot read into INC_Y once fp has been set to the entry-time sp. */
 #define	OLD_INC_Y	[fp, #0 ]
 /* Element count; the routine returns early when it is <= 0. */
 #define	N	r0
 /* Address of the current x element. */
 #define X	r1
 /* x stride in elements on entry; scaled to bytes before the strided loops. */
 #define	INC_X	r2
 /* Address of the current y element. */
 #define	Y	r3
 /* y stride, loaded from OLD_INC_Y; r4 is saved and restored by the routine. */
 #define INC_Y	r4
 /* Loop counter. */
 #define I	r12
 /* Byte offset ahead of X / Y used by the pld prefetch hints. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*****************************************************************************************/
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 /*
  * KERNEL_F4 (real, double precision, unit stride).
  * Rotates four consecutive (x, y) pairs in place. For each pair:
  *     d2 = d0 * x + d1 * y      -> stored to X, X advanced
  *     d3 = d0 * y - d1 * x      -> stored to Y, Y advanced
  * d0 / d1 hold the two rotation multipliers; presumably c and s, matching
  * the formula in kernel/arm/rot.c - confirm against the calling convention.
  * d4 / d5 hold the loaded x / y values. One prefetch hint per vector is
  * issued X_PRE bytes ahead.
  */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	// pair 1
 	fldmiad	X,  { d4 }
 	fldmiad	Y,  { d5 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d5
 	vmul.f64    d3 , d0, d5
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	// pair 2
 	fldmiad	X,  { d4 }
 	fldmiad	Y,  { d5 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d5
 	vmul.f64    d3 , d0, d5
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	// pair 3
 	fldmiad	X,  { d4 }
 	fldmiad	Y,  { d5 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d5
 	vmul.f64    d3 , d0, d5
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	// pair 4
 	fldmiad	X,  { d4 }
 	fldmiad	Y,  { d5 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d5
 	vmul.f64    d3 , d0, d5
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 .endm
 /*
  * KERNEL_F1 (real, double precision, unit stride).
  * Rotates one (x, y) pair in place and advances X and Y by one element:
  *     d2 = d0 * x + d1 * y      -> stored to X
  *     d3 = d0 * y - d1 * x      -> stored to Y
  */
 .macro KERNEL_F1
 	fldmiad	X,  { d4 }
 	fldmiad	Y,  { d5 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d5
 	vmul.f64    d3 , d0, d5
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 .endm
 /*
  * KERNEL_S1 (real, double precision, strided).
  * Rotates one (x, y) pair in place, storing without writeback, then steps
  * X by INC_X and Y by INC_Y (both already scaled to bytes by the routine).
  *     d2 = d0 * x + d1 * y      -> stored to X
  *     d3 = d0 * y - d1 * x      -> stored to Y
  */
 .macro KERNEL_S1
 	fldmiad	X,  { d4 }
 	fldmiad	Y,  { d5 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d5
 	vmul.f64    d3 , d0, d5
 	fnmacd      d3 , d1, d4
 	fstmiad	X, { d2 }
 	fstmiad	Y, { d3 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #else
 /*
  * KERNEL_F4 (real, single precision, unit stride).
  * Rotates four consecutive (x, y) pairs in place. For each pair:
  *     s2 = s0 * x + s1 * y      -> stored to X, X advanced
  *     s3 = s0 * y - s1 * x      -> stored to Y, Y advanced
  * s0 / s1 hold the two rotation multipliers; presumably c and s, matching
  * the formula in kernel/arm/rot.c - confirm against the calling convention.
  * This variant contains no prefetch; the routine body issues the pld hints
  * for this configuration at the top of its unrolled loop.
  */
 .macro KERNEL_F4
 	// pair 1
 	fldmias	X,  { s4 }
 	fldmias	Y,  { s5 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s5
 	vmul.f32    s3 , s0, s5
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	// pair 2
 	fldmias	X,  { s4 }
 	fldmias	Y,  { s5 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s5
 	vmul.f32    s3 , s0, s5
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	// pair 3
 	fldmias	X,  { s4 }
 	fldmias	Y,  { s5 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s5
 	vmul.f32    s3 , s0, s5
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	// pair 4
 	fldmias	X,  { s4 }
 	fldmias	Y,  { s5 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s5
 	vmul.f32    s3 , s0, s5
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 .endm
 /*
  * KERNEL_F1 (real, single precision, unit stride).
  * Rotates one (x, y) pair in place and advances X and Y by one element:
  *     s2 = s0 * x + s1 * y      -> stored to X
  *     s3 = s0 * y - s1 * x      -> stored to Y
  */
 .macro KERNEL_F1
 	fldmias	X,  { s4 }
 	fldmias	Y,  { s5 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s5
 	vmul.f32    s3 , s0, s5
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 .endm
 /*
  * KERNEL_S1 (real, single precision, strided).
  * Rotates one (x, y) pair in place, storing without writeback, then steps
  * X by INC_X and Y by INC_Y (both already scaled to bytes by the routine).
  *     s2 = s0 * x + s1 * y      -> stored to X
  *     s3 = s0 * y - s1 * x      -> stored to Y
  */
 .macro KERNEL_S1
 	fldmias	X,  { s4 }
 	fldmias	Y,  { s5 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s5
 	vmul.f32    s3 , s0, s5
 	fnmacs      s3 , s1, s4
 	fstmias	X, { s2 }
 	fstmias	Y, { s3 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 /*
  * KERNEL_F4 (complex, double precision, unit stride).
  * Rotates four consecutive complex (x, y) pairs in place. Each element is
  * loaded as two values (d4, d5 from X; d6, d7 from Y) and both components
  * are rotated independently with the same multipliers d0 and d1:
  *     d2 = d0 * x + d1 * y      -> stored to X, X advanced
  *     d3 = d0 * y - d1 * x      -> stored to Y, Y advanced
  * d0 / d1 are presumably c and s, matching the formula in
  * kernel/arm/rot.c - confirm against the calling convention.
  * Prefetch hints are issued before the first and the third element.
  */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	// element 1
 	fldmiad	X,  { d4 - d5 }
 	fldmiad	Y,  { d6 - d7 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d6
 	vmul.f64    d3 , d0, d6
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	vmul.f64    d2 , d0, d5
 	fmacd       d2 , d1, d7
 	vmul.f64    d3 , d0, d7
 	fnmacd      d3 , d1, d5
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	// element 2
 	fldmiad	X,  { d4 - d5 }
 	fldmiad	Y,  { d6 - d7 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d6
 	vmul.f64    d3 , d0, d6
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	vmul.f64    d2 , d0, d5
 	fmacd       d2 , d1, d7
 	vmul.f64    d3 , d0, d7
 	fnmacd      d3 , d1, d5
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	// element 3
 	fldmiad	X,  { d4 - d5 }
 	fldmiad	Y,  { d6 - d7 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d6
 	vmul.f64    d3 , d0, d6
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	vmul.f64    d2 , d0, d5
 	fmacd       d2 , d1, d7
 	vmul.f64    d3 , d0, d7
 	fnmacd      d3 , d1, d5
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	// element 4
 	fldmiad	X,  { d4 - d5 }
 	fldmiad	Y,  { d6 - d7 }
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d6
 	vmul.f64    d3 , d0, d6
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	vmul.f64    d2 , d0, d5
 	fmacd       d2 , d1, d7
 	vmul.f64    d3 , d0, d7
 	fnmacd      d3 , d1, d5
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 .endm
 /*
  * KERNEL_F1 (complex, double precision, unit stride).
  * Rotates one complex (x, y) pair in place. Both components are rotated
  * with the same multipliers d0 and d1; each store advances X / Y by one
  * value, so both pointers end one complex element further on.
  */
 .macro KERNEL_F1
 	fldmiad	X,  { d4 - d5 }
 	fldmiad	Y,  { d6 - d7 }
 	// first component
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d6
 	vmul.f64    d3 , d0, d6
 	fnmacd      d3 , d1, d4
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 	// second component
 	vmul.f64    d2 , d0, d5
 	fmacd       d2 , d1, d7
 	vmul.f64    d3 , d0, d7
 	fnmacd      d3 , d1, d5
 	fstmiad	X!, { d2 }
 	fstmiad	Y!, { d3 }
 .endm
 /*
  * KERNEL_S1 (complex, double precision, strided).
  * Rotates one complex (x, y) pair in place. Results are written with
  * fixed offsets 0 and 8 from X / Y, then X and Y are stepped by INC_X and
  * INC_Y (both already scaled to bytes by the routine).
  */
 .macro KERNEL_S1
 	fldmiad	X,  { d4 - d5 }
 	fldmiad	Y,  { d6 - d7 }
 	// first component
 	vmul.f64    d2 , d0, d4
 	fmacd       d2 , d1, d6
 	vmul.f64    d3 , d0, d6
 	fnmacd      d3 , d1, d4
 	vstr	    d2 , [ X, #0 ]
 	vstr	    d3 , [ Y, #0 ]
 	// second component
 	vmul.f64    d2 , d0, d5
 	fmacd       d2 , d1, d7
 	vmul.f64    d3 , d0, d7
 	fnmacd      d3 , d1, d5
 	vstr	    d2 , [ X, #8 ]
 	vstr	    d3 , [ Y, #8 ]
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #else
 /*
  * KERNEL_F4 (complex, single precision, unit stride).
  * Rotates four consecutive complex (x, y) pairs in place. Each element is
  * loaded as two values (s4, s5 from X; s6, s7 from Y) and both components
  * are rotated independently with the same multipliers s0 and s1:
  *     s2 = s0 * x + s1 * y      -> stored to X, X advanced
  *     s3 = s0 * y - s1 * x      -> stored to Y, Y advanced
  * s0 / s1 are presumably c and s, matching the formula in
  * kernel/arm/rot.c - confirm against the calling convention.
  * Prefetch hints are issued before the first and the third element.
  */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	// element 1
 	fldmias	X,  { s4 - s5 }
 	fldmias	Y,  { s6 - s7 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s6
 	vmul.f32    s3 , s0, s6
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	vmul.f32    s2 , s0, s5
 	fmacs       s2 , s1, s7
 	vmul.f32    s3 , s0, s7
 	fnmacs      s3 , s1, s5
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	// element 2
 	fldmias	X,  { s4 - s5 }
 	fldmias	Y,  { s6 - s7 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s6
 	vmul.f32    s3 , s0, s6
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	vmul.f32    s2 , s0, s5
 	fmacs       s2 , s1, s7
 	vmul.f32    s3 , s0, s7
 	fnmacs      s3 , s1, s5
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	// element 3
 	fldmias	X,  { s4 - s5 }
 	fldmias	Y,  { s6 - s7 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s6
 	vmul.f32    s3 , s0, s6
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	vmul.f32    s2 , s0, s5
 	fmacs       s2 , s1, s7
 	vmul.f32    s3 , s0, s7
 	fnmacs      s3 , s1, s5
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	// element 4
 	fldmias	X,  { s4 - s5 }
 	fldmias	Y,  { s6 - s7 }
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s6
 	vmul.f32    s3 , s0, s6
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	vmul.f32    s2 , s0, s5
 	fmacs       s2 , s1, s7
 	vmul.f32    s3 , s0, s7
 	fnmacs      s3 , s1, s5
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 .endm
 /*
  * KERNEL_F1 (complex, single precision, unit stride).
  * Rotates one complex (x, y) pair in place. Both components are rotated
  * with the same multipliers s0 and s1; each store advances X / Y by one
  * value, so both pointers end one complex element further on.
  */
 .macro KERNEL_F1
 	fldmias	X,  { s4 - s5 }
 	fldmias	Y,  { s6 - s7 }
 	// first component
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s6
 	vmul.f32    s3 , s0, s6
 	fnmacs      s3 , s1, s4
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 	// second component
 	vmul.f32    s2 , s0, s5
 	fmacs       s2 , s1, s7
 	vmul.f32    s3 , s0, s7
 	fnmacs      s3 , s1, s5
 	fstmias	X!, { s2 }
 	fstmias	Y!, { s3 }
 .endm
 /*
  * KERNEL_S1 (complex, single precision, strided).
  * Rotates one complex (x, y) pair in place. Results are written with
  * fixed offsets 0 and 4 from X / Y, then X and Y are stepped by INC_X and
  * INC_Y (both already scaled to bytes by the routine).
  */
 .macro KERNEL_S1
 	fldmias	X,  { s4 - s5 }
 	fldmias	Y,  { s6 - s7 }
 	// first component
 	vmul.f32    s2 , s0, s4
 	fmacs       s2 , s1, s6
 	vmul.f32    s3 , s0, s6
 	fnmacs      s3 , s1, s4
 	vstr	    s2 , [ X, #0 ]
 	vstr	    s3 , [ Y, #0 ]
 	// second component
 	vmul.f32    s2 , s0, s5
 	fmacs       s2 , s1, s7
 	vmul.f32    s3 , s0, s7
 	fnmacs      s3 , s1, s5
 	vstr	    s2 , [ X, #4 ]
 	vstr	    s3 , [ Y, #4 ]
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Routine body: in-place plane rotation of N (x, y) pairs.
  *
  * Inputs (register aliases defined at the top of the file):
  *   N, X, INC_X, Y arrive in r0-r3; INC_Y is read from the stack slot at
  *   the entry-time sp; the two multipliers are expected in d0/d1 (s0/s1),
  *   which is where the kernel macros read them.
  * Returns 0 in r0.
  *
  * Nothing is modified when N <= 0 or either stride is 0. When both strides
  * are 1 the contiguous kernels are used; otherwise the strides are scaled to
  * bytes and the strided kernel is used.
  */
 	PROLOGUE
 	.align 5
 	// Save r4 (used as INC_Y) and fp, then point fp at the entry-time sp.
 	push    {r4 , fp}
        add     fp, sp, #8
 	ldr    INC_Y , OLD_INC_Y
 	cmp	N, #0
 	ble	rot_kernel_L999
 	cmp	INC_X, #0
 	beq	rot_kernel_L999
 	cmp	INC_Y, #0
 	beq	rot_kernel_L999
 	cmp	INC_X, #1
 	bne	rot_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	rot_kernel_S_BEGIN
 	// ---- contiguous path: N / 4 KERNEL_F4 passes, then N % 4 single pairs ----
 rot_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	rot_kernel_F1
 	.align 5
 	// The loop body holds two KERNEL_F4 expansions, each followed by its own
 	// decrement and exit test.
 rot_kernel_F4:
 #if !defined(COMPLEX) && !defined(DOUBLE)
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 #endif
 	KERNEL_F4
 	subs	I, I, #1
 	ble	rot_kernel_F1
 	KERNEL_F4
 	subs	I, I, #1
 	bne	rot_kernel_F4
 rot_kernel_F1:
 	ands	I, N, #3
 	ble	rot_kernel_L999
 rot_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     rot_kernel_F10
 	b	rot_kernel_L999
 	// ---- strided path: convert both strides from elements to bytes ----
 rot_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 #endif
 #endif
 	// N / 4 passes of four KERNEL_S1, then N % 4 single pairs.
 	asrs	I, N, #2					// I = N / 4
 	ble	rot_kernel_S1
 	.align 5
 rot_kernel_S4:
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	subs	I, I, #1
 	bne	rot_kernel_S4
 rot_kernel_S1:
 	ands	I, N, #3
 	ble	rot_kernel_L999
 rot_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     rot_kernel_S10
 	// ---- common exit: return 0 and restore the saved registers ----
 rot_kernel_L999:
 	mov	r0, #0		// set return value
 	sub     sp, fp, #8
 	pop     {r4,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/scal.c
+++ b/kernel/arm/scal.c
@ -0,0 +1,58 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Scale n elements of x, taken with stride inc_x, by da (x[k] = da * x[k]).
  *
  * Returns without touching x when n < 0, when inc_x < 1, or when da is
  * exactly 1.0. The remaining parameters (dummy0, dummy1, y, inc_y, dummy,
  * dummy2) are not used by this implementation. Always returns 0.
  */
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG pos;
 	BLASLONG limit;

 	if (n < 0 || inc_x < 1)
 		return 0;
 	if (da == 1.0)
 		return 0;

 	/* One past the last index visited: n strided steps of inc_x. */
 	limit = n * inc_x;
 	for (pos = 0; pos < limit; pos += inc_x)
 		x[pos] = da * x[pos];

 	return 0;
 }
--- a/kernel/arm/scal_vfp.S
+++ b/kernel/arm/scal_vfp.S
@ -0,0 +1,376 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/15 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Stack frame size; the code in this file never reserves or references it. */
 #define STACKSIZE 256
 /* inc_x is fetched from the top of the incoming stack (sp is never moved in this file). */
 #define	OLD_INC_X	[sp, #0 ]
 /* Element count, taken directly from the incoming r0. */
 #define	N	r0
 /* Stride of x: loaded from OLD_INC_X, converted to a byte stride on the strided path. */
 #define	INC_X	r1
 /* Pointer to the current element of x, taken directly from the incoming r3. */
 #define	X	r3
 /* Loop counter. */
 #define I	r12
 /* Distance in bytes ahead of X that the pld instructions prefetch. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*****************************************************************************************/
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 /*
  * Real, double precision variants.  d0 is the multiplier (never written
  * in this file, so it is whatever the caller passed); d4-d7 are scratch.
  */
 /* Scale four contiguous doubles in place; X ends up 32 bytes further. */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	fldmiad	X,  { d4 - d7 }
 	vmul.f64    d4, d4, d0	
 	vmul.f64    d5, d5, d0	
 	vmul.f64    d6, d6, d0	
 	fstmiad	X!, { d4 - d5 }
 	vmul.f64    d7, d7, d0	
 	fstmiad	X!, { d6 - d7 }
 .endm
 /* Scale one double in place; the store writes back, so X advances by 8 bytes. */
 .macro KERNEL_F1
 	fldmiad	X,  { d4 }
 	vmul.f64    d4, d4, d0	
 	fstmiad	X!, { d4 }
 .endm
 /* Strided variant: scale one double, then step X by INC_X (a byte stride here). */
 .macro KERNEL_S1
 	fldmiad	X,  { d4 }
 	vmul.f64    d4, d4, d0	
 	fstmiad	X,  { d4 }
 	add	X, X, INC_X
 .endm
 #else
 /*
  * Real, single precision variants.  s0 is the multiplier (never written
  * in this file); s4-s7 are scratch.
  */
 /* Scale four contiguous floats in place; X ends up 16 bytes further.
    No pld here: the caller loop issues it for this variant. */
 .macro KERNEL_F4
 	fldmias	X,  { s4 - s7 }
 	vmul.f32    s4, s4, s0	
 	vmul.f32    s5, s5, s0	
 	vmul.f32    s6, s6, s0	
 	fstmias	X!, { s4 - s5 }
 	vmul.f32    s7, s7, s0	
 	fstmias	X!, { s6 - s7 }
 .endm
 /* Scale one float in place; X advances by 4 bytes. */
 .macro KERNEL_F1
 	fldmias	X,  { s4 }
 	vmul.f32    s4, s4, s0	
 	fstmias	X!, { s4 }
 .endm
 /* Strided variant: scale one float, then step X by INC_X (a byte stride here). */
 .macro KERNEL_S1
 	fldmias	X,  { s4 }
 	vmul.f32    s4, s4, s0	
 	fstmias	X,  { s4 }
 	add	X, X, INC_X
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 /*
  * Complex, double precision variants.  Each element is the pair
  * (d4, d5) loaded from X; the result pair (d2, d3) is
  *     d2 = d0*d4 - d1*d5
  *     d3 = d0*d5 + d1*d4
  * i.e. a complex multiply with (d0, d1) as the (real, imaginary) parts
  * of the factor.  d0/d1 are never written in this file.
  */
 /* Four complex elements, fully unrolled; X ends up 64 bytes further. */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	fldmiad	X,  { d4 - d5 }
 	vmul.f64    d2, d0, d4	
 	fnmacd     d2, d1, d5				// d2 -= d1 * d5
 	vmul.f64    d3, d0, d5
 	fmacd      d3, d1, d4				// d3 += d1 * d4
 	fstmiad	X!, { d2 - d3 }
 	fldmiad	X,  { d4 - d5 }
 	vmul.f64    d2, d0, d4	
 	fnmacd     d2, d1, d5
 	vmul.f64    d3, d0, d5
 	fmacd      d3, d1, d4
 	fstmiad	X!, { d2 - d3 }
 	pld	[ X, #X_PRE ]
 	fldmiad	X,  { d4 - d5 }
 	vmul.f64    d2, d0, d4	
 	fnmacd     d2, d1, d5
 	vmul.f64    d3, d0, d5
 	fmacd      d3, d1, d4
 	fstmiad	X!, { d2 - d3 }
 	fldmiad	X,  { d4 - d5 }
 	vmul.f64    d2, d0, d4	
 	fnmacd     d2, d1, d5
 	vmul.f64    d3, d0, d5
 	fmacd      d3, d1, d4
 	fstmiad	X!, { d2 - d3 }
 .endm
 /* One complex element; X advances by 16 bytes. */
 .macro KERNEL_F1
 	fldmiad	X,  { d4 - d5 }
 	vmul.f64    d2, d0, d4	
 	fnmacd     d2, d1, d5
 	vmul.f64    d3, d0, d5
 	fmacd      d3, d1, d4
 	fstmiad	X!, { d2 - d3 }
 .endm
 /* Strided variant: one complex element, then step X by INC_X (a byte stride here). */
 .macro KERNEL_S1
 	fldmiad	X,  { d4 - d5 }
 	vmul.f64    d2, d0, d4	
 	fnmacd     d2, d1, d5
 	vmul.f64    d3, d0, d5
 	fmacd      d3, d1, d4
 	fstmiad	X, { d2 - d3 }
 	add	X, X, INC_X
 .endm
 #else
 /*
  * Complex, single precision variants.  Each element is the pair
  * (s4, s5) loaded from X; the result pair (s2, s3) is
  *     s2 = s0*s4 - s1*s5
  *     s3 = s0*s5 + s1*s4
  * i.e. a complex multiply with (s0, s1) as the (real, imaginary) parts
  * of the factor.  s0/s1 are never written in this file.
  */
 /* Four complex elements, fully unrolled; X ends up 32 bytes further. */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	fldmias	X,  { s4 - s5 }
 	vmul.f32    s2, s0, s4	
 	fnmacs     s2, s1, s5				// s2 -= s1 * s5
 	vmul.f32    s3, s0, s5
 	fmacs      s3, s1, s4				// s3 += s1 * s4
 	fstmias	X!, { s2 - s3 }
 	fldmias	X,  { s4 - s5 }
 	vmul.f32    s2, s0, s4	
 	fnmacs     s2, s1, s5
 	vmul.f32    s3, s0, s5
 	fmacs      s3, s1, s4
 	fstmias	X!, { s2 - s3 }
 	fldmias	X,  { s4 - s5 }
 	vmul.f32    s2, s0, s4	
 	fnmacs     s2, s1, s5
 	vmul.f32    s3, s0, s5
 	fmacs      s3, s1, s4
 	fstmias	X!, { s2 - s3 }
 	fldmias	X,  { s4 - s5 }
 	vmul.f32    s2, s0, s4	
 	fnmacs     s2, s1, s5
 	vmul.f32    s3, s0, s5
 	fmacs      s3, s1, s4
 	fstmias	X!, { s2 - s3 }
 .endm
 /* One complex element; X advances by 8 bytes. */
 .macro KERNEL_F1
 	fldmias	X,  { s4 - s5 }
 	vmul.f32    s2, s0, s4	
 	fnmacs     s2, s1, s5
 	vmul.f32    s3, s0, s5
 	fmacs      s3, s1, s4
 	fstmias	X!, { s2 - s3 }
 .endm
 /* Strided variant: one complex element, then step X by INC_X (a byte stride here). */
 .macro KERNEL_S1
 	fldmias	X,  { s4 - s5 }
 	vmul.f32    s2, s0, s4	
 	fnmacs     s2, s1, s5
 	vmul.f32    s3, s0, s5
 	fmacs      s3, s1, s4
 	fstmias	X, { s2 - s3 }
 	add	X, X, INC_X
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Entry point: multiplies N elements of x in place by the factor held in
  * s0/d0 (plus s1/d1 in COMPLEX builds) and returns 0 in r0.
  * Returns without touching x when N <= 0 or INC_X <= 0.
  * INC_X == 1 takes the contiguous path (four elements per KERNEL_F4);
  * any other positive stride takes the strided path (KERNEL_S1).
  * No stack frame is set up and no callee registers are pushed.
  */
 	PROLOGUE
 	.align 5
 	ldr    INC_X , OLD_INC_X
 	cmp	N, #0
 	ble	scal_kernel_L999
 	cmp	INC_X, #0
 	ble	scal_kernel_L999
 	cmp	INC_X, #1
 	bne	scal_kernel_S_BEGIN
 /* Contiguous path. */
 scal_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	scal_kernel_F1
 	.align 5
 /* The loop body is unrolled twice, with an early exit after the first
    copy when the block count is odd. */
 scal_kernel_F4:
 #if !defined(COMPLEX) && !defined(DOUBLE)
 	pld	[ X, #X_PRE ]
 #endif
 	KERNEL_F4
 	subs	I, I, #1
 	ble	scal_kernel_F1
 	KERNEL_F4
 	subs	I, I, #1
 	bne	scal_kernel_F4
 /* Remaining N % 4 elements, one at a time. */
 scal_kernel_F1:
 	ands	I, N, #3
 	ble	scal_kernel_L999
 scal_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     scal_kernel_F10
 	b	scal_kernel_L999
 /* Strided path: turn the element stride into a byte stride first. */
 scal_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 #endif
 #endif
 	asrs	I, N, #2					// I = N / 4
 	ble	scal_kernel_S1
 	.align 5
 scal_kernel_S4:
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	subs	I, I, #1
 	bne	scal_kernel_S4
 /* Remaining N % 4 elements. */
 scal_kernel_S1:
 	ands	I, N, #3
 	ble	scal_kernel_L999
 scal_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     scal_kernel_S10
 scal_kernel_L999:
 	mov	r0, #0		// set return value
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/scopy_vfp.S
+++ b/kernel/arm/scopy_vfp.S
@ -0,0 +1,224 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/07 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes reserved below the pushed registers by the prologue. */
 #define STACKSIZE 256
 /* Incoming argument registers: element count, source pointer, source stride, destination pointer. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 * (the code in this file stores s8 - s15 there, i.e. the
 * 32 bytes starting at [fp, #-128])
 *******************************************************/
 /* Destination stride: first stacked argument, just above the saved fp. */
 #define OLD_INC_Y	[fp, #4 ]
 /* Loop counter. */
 #define I	r5
 /* Working copy of the destination pointer. */
 #define Y	r6
 /* Destination stride; converted to a byte stride on the strided path. */
 #define INC_Y	r7
 /* Distance in bytes ahead of X that pld prefetches. */
 #define X_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /* Copy eight contiguous floats from X to Y through s0-s7;
    both pointers advance by 32 bytes. */
 .macro COPY_F8
 	pld	[ X, #X_PRE  ]
 	fldmias	X!, { s0 - s3 }
 	fldmias	X!, { s4 - s7 }
 	fstmias	Y!, { s0 - s3 }
 	fstmias	Y!, { s4 - s7 }
 .endm
 /* Copy one float from X to Y through s0; both pointers advance by 4 bytes. */
 .macro COPY_F1
 	fldmias	X!, { s0 }
 	fstmias	Y!, { s0 }
 .endm
 /*************************************************************************************************************************/
 /* Strided copy of four floats, alternating between s0 and s1 as the
    transfer register.  After each element X and Y are stepped by INC_X
    and INC_Y, which the caller has already turned into byte strides. */
 .macro COPY_S4
 	nop
 	fldmias	X, { s0 }
 	fstmias	Y, { s0 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s1 }
 	fstmias	Y, { s1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s0 }
 	fstmias	Y, { s0 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s1 }
 	fstmias	Y, { s1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /* Strided copy of one float through s0, then step X and Y by their byte strides. */
 .macro COPY_S1
 	fldmias	X, { s0 }
 	fstmias	Y, { s0 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Entry point: copies N floats from X (stride INC_X) to Y (stride INC_Y)
  * and returns 0 in r0.  Nothing is copied when N <= 0 or when either
  * stride is zero.  Both strides equal to 1 takes the contiguous path;
  * every other combination takes the strided path.
  * Frame: r4-r9 and fp are pushed, fp is set to the slot holding the saved
  * fp, STACKSIZE bytes are reserved below the pushed registers, and
  * s8-s15 are parked at [fp, #-128] for the duration of the call.
  */
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15} 				// store floating point registers
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 	cmp	N, #0
 	ble	scopy_kernel_L999
 	cmp	INC_X, #0
 	beq	scopy_kernel_L999
 	cmp	INC_Y, #0
 	beq	scopy_kernel_L999
 	cmp	INC_X, #1
 	bne	scopy_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	scopy_kernel_S_BEGIN
 /* Contiguous path: blocks of eight, then the N % 8 leftovers. */
 scopy_kernel_F_BEGIN:
 	asrs	I, N, #3					// I = N / 8
 	ble	scopy_kernel_F1
 scopy_kernel_F8:
 	COPY_F8
 	subs	I, I, #1
 	bne	scopy_kernel_F8
 scopy_kernel_F1:
 	ands	I, N, #7
 	ble	scopy_kernel_L999
 scopy_kernel_F10:
 	COPY_F1
 	subs    I, I, #1
        bne     scopy_kernel_F10
 	b	scopy_kernel_L999
 /* Strided path: blocks of four, then the N % 4 leftovers. */
 scopy_kernel_S_BEGIN:
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 	asrs	I, N, #2					// I = N / 4
 	ble	scopy_kernel_S1
 scopy_kernel_S4:
 	COPY_S4
 	subs	I, I, #1
 	bne	scopy_kernel_S4
 scopy_kernel_S1:
 	ands	I, N, #3
 	ble	scopy_kernel_L999
 scopy_kernel_S10:
 	COPY_S1
 	subs    I, I, #1
        bne     scopy_kernel_S10
 /* Common exit: undo the frame set up in the prologue. */
 scopy_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/sdot_vfp.S
+++ b/kernel/arm/sdot_vfp.S
@ -0,0 +1,347 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/11 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK (no test for dsdot)
 * 	 TEST			: OK (no test for dsdot)
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes reserved below the pushed registers by the prologue. */
 #define STACKSIZE 256
 /* Incoming argument registers: element count, x pointer, x stride, y pointer. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 * (the code in this file stores s8 - s15 there, i.e. the
 * 32 bytes starting at [fp, #-128])
 *******************************************************/
 /* y stride: first stacked argument, just above the saved fp. */
 #define OLD_INC_Y	[fp, #4 ]
 /* Loop counter. */
 #define I	r5
 /* Working copy of the y pointer. */
 #define Y	r6
 /* y stride; converted to a byte stride on the strided path. */
 #define INC_Y	r7
 /* Prefetch distance; no pld in this file references it. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if defined(DSDOT)
 /*
  * DSDOT variants: single precision inputs, double precision accumulator d0.
  * Each step loads one float from X and one from Y into s14/s15, multiplies
  * them in single precision, widens the product into d4 and adds it to d0.
  * d4 overlaps s8/s9, so s8, s9, s14 and s15 are all clobbered.
  */
 /* Four contiguous element pairs; X and Y each advance by 16 bytes. */
 .macro KERNEL_F4
 	fldmias	X!, { s14 }
 	fldmias	Y!, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	fldmias	X!, { s14 }
 	fldmias	Y!, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	fldmias	X!, { s14 }
 	fldmias	Y!, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	fldmias	X!, { s14 }
 	fldmias	Y!, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 .endm
 /* One contiguous element pair; X and Y each advance by 4 bytes. */
 .macro KERNEL_F1
 	fldmias	X!, { s14 }
 	fldmias	Y!, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 .endm
 /* Four strided element pairs; X and Y step by their byte strides after each pair. */
 .macro KERNEL_S4
 	nop
 	fldmias	X, { s14 }
 	fldmias	Y, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s14 }
 	fldmias	Y, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s14 }
 	fldmias	Y, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmias	X, { s14 }
 	fldmias	Y, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /* One strided element pair. */
 .macro KERNEL_S1
 	fldmias	X, { s14 }
 	fldmias	Y, { s15 }
 	vmul.f32   s15, s14, s15
 	vcvt.f64.f32	d4, s15
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #else
 /*
  * Single precision variants.  Products are accumulated with fmacs into
  * two partial sums, s0 and s1; the entry point adds them at the end.
  * X values go through s4-s7 or s8-s11 and Y values through the other
  * group, depending on the macro.
  */
 /* Four contiguous element pairs, loads interleaved with the multiply-adds;
    X and Y each advance by 16 bytes. */
 .macro KERNEL_F4
 	fldmias	X!, { s8 - s9 }
 	fldmias	Y!, { s4 - s5}
 	fmacs   s0  , s4,  s8
 	fldmias	X!, { s10 - s11 }
 	fmacs   s1  , s5,  s9
 	fldmias	Y!, { s6 - s7 }
 	fmacs   s0  , s6,  s10
 	fmacs   s1  , s7,  s11
 .endm
 /* One contiguous element pair, accumulated into s0 only. */
 .macro KERNEL_F1
 	fldmias	X!, { s4 }
 	fldmias	Y!, { s8 }
 	fmacs   s0  , s4,  s8
 .endm
 /* Four strided element pairs, alternating between the s0 and s1 partial sums. */
 .macro KERNEL_S4
 	nop
 	fldmias	X, { s4 }
 	fldmias	Y, { s8 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacs   s0  , s4,  s8
 	fldmias	X, { s5 }
 	fldmias	Y, { s9 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacs   s1  , s5,  s9
 	fldmias	X, { s6 }
 	fldmias	Y, { s10 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacs   s0  , s6,  s10
 	fldmias	X, { s7 }
 	fldmias	Y, { s11 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fmacs   s1  , s7,  s11
 .endm
 /* One strided element pair, accumulated into s0 only. */
 .macro KERNEL_S1
 	fldmias	X, { s4 }
 	fldmias	Y, { s8 }
 	add	X, X, INC_X
 	fmacs   s0  , s4,  s8
 	add	Y, Y, INC_Y
 .endm
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15 } 				// store floating point registers
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 #if	defined(DSDOT)
 	vsub.f64                d0 , d0 , d0
 	vsub.f64                d1 , d1 , d1
 #else
 	vsub.f32                s0 , s0 , s0
 	vsub.f32                s1 , s1 , s1
 #endif
 	cmp	N, #0
 	ble	sdot_kernel_L999
 	cmp	INC_X, #0
 	beq	sdot_kernel_L999
 	cmp	INC_Y, #0
 	beq	sdot_kernel_L999
 	cmp	INC_X, #1
 	bne	sdot_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	sdot_kernel_S_BEGIN
 sdot_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	sdot_kernel_F1
 sdot_kernel_F4:
 	KERNEL_F4
 	subs	I, I, #1
 	bne	sdot_kernel_F4
 sdot_kernel_F1:
 	ands	I, N, #3
 	ble	sdot_kernel_L999
 sdot_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     sdot_kernel_F10
 	b	sdot_kernel_L999
 sdot_kernel_S_BEGIN:
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 	asrs	I, N, #2					// I = N / 4
 	ble	sdot_kernel_S1
 sdot_kernel_S4:
 	KERNEL_S4
 	subs	I, I, #1
 	bne	sdot_kernel_S4
 sdot_kernel_S1:
 	ands	I, N, #3
 	ble	sdot_kernel_L999
 sdot_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     sdot_kernel_S10
 sdot_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 #if	defined(DSDOT)
 	vadd.f64	d0 , d0, d1				// set return value
 #else
 	vadd.f32	s0 , s0, s1				// set return value
 #endif
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/sgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/sgemm_kernel_4x2_vfp.S
@ -0,0 +1,797 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/28 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes reserved below the pushed registers by the prologue. */
 #define STACKSIZE 256
 /* Incoming arguments as they arrive in registers; the prologue spills
    them to the frame slots M, N, K, A and ALPHA defined below. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_K	r2
 #define	OLD_A	r3
 #define OLD_ALPHA s0
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 * (the code in this file stores s8 - s15 there, i.e. the
 * 32 bytes starting at [fp, #-128])
 *******************************************************/
 /* Frame slots below fp.  With fp = (sp after push) + 24 and STACKSIZE
    bytes reserved, the frame spans [fp, #-280] up to [fp, #-24];
    ALPHA sits at its lowest word. */
 #define LDC	[fp, #-252 ]
 #define M	[fp, #-256 ]
 #define N	[fp, #-260 ]
 #define K	[fp, #-264 ]
 #define A	[fp, #-268 ]
 #define ALPHA	[fp, #-280]
 /* Stacked incoming arguments, just above the saved fp.  The C slot is
    also overwritten during the run: it always holds the start of the next
    column panel. */
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
 /* Loop counters: I over row tiles, J over column panels, L over K. */
 #define I	r0
 #define J	r1
 #define L	r2
 /* Running pointers into the A and B buffers. */
 #define	AO	r5
 #define	BO	r6
 /* Pointers into the first and second C column of the current panel. */
 #define	CO1	r8
 #define	CO2	r9
 /* K, kept in a register for the whole call. */
 #define K1	r7
 /* Start of the B data for the current column panel. */
 #define BC	r12
 /* Prefetch distances in bytes for pld.  C_PRE is not referenced by the
    code in this file. */
 #define A_PRE	96
 #define B_PRE	96
 #define C_PRE	64
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * Clear the eight accumulators s8-s15 of a 4x2 tile.
  * The zero is moved in from a core register.  Subtracting s8 from itself
  * is not a safe way to clear it: the difference is NaN whenever s8 still
  * holds NaN or +/-Inf (from the caller or from the previous tile), and
  * that NaN would then be added into every following tile.
  * Clobbers r3, which this file only ever uses as a freshly written
  * temporary.  The condition flags are left untouched.
  */
 .macro INIT4x2
 	mov			r3 , #0
 	vmov			s8 , r3
 	vmov.f32		s9, s8
 	vmov.f32		s10, s8
 	vmov.f32		s11, s8
 	vmov.f32		s12, s8
 	vmov.f32		s13, s8
 	vmov.f32		s14, s8
 	vmov.f32		s15, s8
 .endm
 /*
  * One K step of the 4x2 tile: load four floats from AO into s0-s3 and
  * two from BO into s4-s5 (both pointers post-increment), then add the
  * eight products into the accumulators:
  *     s8-s11  += s0-s3 * s4
  *     s12-s15 += s0-s3 * s5
  */
 .macro KERNEL4x2_SUB
 	fldmias	AO! , { s0 - s3 }
 	fldmias	BO! , { s4 - s5 }
 	fmacs	s8  , s0,  s4
 	fmacs	s9  , s1,  s4
 	fmacs	s10  , s2,  s4
 	fmacs	s11  , s3,  s4
 	fmacs	s12  , s0,  s5
 	fmacs	s13  , s1,  s5
 	fmacs	s14  , s2,  s5
 	fmacs	s15  , s3,  s5
 .endm
 /*
  * Write back the 4x2 tile: C += alpha * accumulator.
  * CO2 is set to CO1 + LDC (LDC is a byte stride).  s8-s11 update the four
  * floats at CO1, s12-s15 the four floats at CO2.  CO1 then advances by
  * 16 bytes to the next row tile.  Clobbers r3, s0 and s4-s7.
  */
 .macro SAVE4x2
 	ldr	r3  , LDC
 	add	CO2 , CO1, r3
 	flds		s0, ALPHA
 	flds	s4 , [CO1]
 	flds	s5 , [CO1, #4 ]
 	flds	s6 , [CO1, #8 ]
 	flds	s7 , [CO1, #12 ]
 	fmacs	s4 , s0 , s8
 	fmacs	s5 , s0 , s9
 	fmacs	s6 , s0 , s10
 	fmacs	s7 , s0 , s11
 	fsts	s4 , [CO1]
 	fsts	s5 , [CO1, #4 ]
 	fsts	s6 , [CO1, #8 ]
 	fsts	s7 , [CO1, #12 ]
 	flds	s4 , [CO2]
 	flds	s5 , [CO2, #4 ]
 	flds	s6 , [CO2, #8 ]
 	flds	s7 , [CO2, #12 ]
 	fmacs	s4 , s0 , s12
 	fmacs	s5 , s0 , s13
 	fmacs	s6 , s0 , s14
 	fmacs	s7 , s0 , s15
 	fsts	s4 , [CO2]
 	fsts	s5 , [CO2, #4 ]
 	fsts	s6 , [CO2, #8 ]
 	fsts	s7 , [CO2, #12 ]
 	add	CO1, CO1, #16
 .endm
 /******************************************************************************/
 /*
  * Clear the four accumulators s8, s9, s12, s13 of a 2x2 tile.
  * The zero is moved in from a core register.  Subtracting s8 from itself
  * is not a safe way to clear it: the difference is NaN whenever s8 still
  * holds NaN or +/-Inf (from the caller or from the previous tile).
  * Clobbers r3, which this file only ever uses as a freshly written
  * temporary.  The condition flags are left untouched.
  */
 .macro INIT2x2
 	mov			r3 , #0
 	vmov			s8 , r3
 	vmov.f32		s9, s8
 	vmov.f32		s12, s8
 	vmov.f32		s13, s8
 .endm
 /*
  * One K step of the 2x2 tile: two floats from BO (s4, s5) and two from
  * AO (s0, s1), then
  *     s8,  s9  += s0, s1 * s4
  *     s12, s13 += s0, s1 * s5
  * AO and BO each advance by 8 bytes.
  */
 .macro KERNEL2x2_SUB
 	flds	s4 , [ BO ]
 	flds	s5 , [ BO, #4 ]
 	flds	s0 , [ AO ]
 	flds	s1 , [ AO, #4 ]
 	fmacs	s8  , s0,  s4
 	fmacs	s9  , s1,  s4
 	fmacs	s12  , s0,  s5
 	fmacs	s13  , s1,  s5
 	add	AO , AO, #8
 	add	BO , BO, #8
 .endm
 /*
  * Write back the 2x2 tile: C += alpha * accumulator.
  * CO2 is set to CO1 + LDC (byte stride).  s8/s9 update the two floats at
  * CO1, s12/s13 the two floats at CO2.  CO1 then advances by 8 bytes.
  * Clobbers r3, s0, s4 and s5.
  */
 .macro SAVE2x2
 	ldr	r3  , LDC
 	add	CO2 , CO1, r3
 	flds		s0, ALPHA
 	flds	s4 , [CO1]
 	flds	s5 , [CO1, #4 ]
 	fmacs	s4 , s0 , s8
 	fmacs	s5 , s0 , s9
 	fsts	s4 , [CO1]
 	fsts	s5 , [CO1, #4 ]
 	flds	s4 , [CO2]
 	flds	s5 , [CO2, #4 ]
 	fmacs	s4 , s0 , s12
 	fmacs	s5 , s0 , s13
 	fsts	s4 , [CO2]
 	fsts	s5 , [CO2, #4 ]
 	add	CO1, CO1, #8
 .endm
 /******************************************************************************/
 /*
  * Clear the two accumulators s8 and s12 of a 1x2 tile.
  * The zero is moved in from a core register.  Subtracting s8 from itself
  * is not a safe way to clear it: the difference is NaN whenever s8 still
  * holds NaN or +/-Inf (from the caller or from the previous tile).
  * Clobbers r3, which this file only ever uses as a freshly written
  * temporary.  The condition flags are left untouched.
  */
 .macro INIT1x2
 	mov			r3 , #0
 	vmov			s8 , r3
 	vmov.f32		s12, s8
 .endm
 /*
  * One K step of the 1x2 tile: two floats from BO (s4, s5) and one from
  * AO (s0), then s8 += s0 * s4 and s12 += s0 * s5.
  * AO advances by 4 bytes, BO by 8 bytes.
  */
 .macro KERNEL1x2_SUB
 	flds	s4 , [ BO ]
 	flds	s5 , [ BO, #4 ]
 	flds	s0 , [ AO ]
 	fmacs	s8  , s0,  s4
 	fmacs	s12  , s0,  s5
 	add	AO , AO, #4
 	add	BO , BO, #8
 .endm
 /*
  * Write back the 1x2 tile: C += alpha * accumulator.
  * CO2 is set to CO1 + LDC (byte stride).  s8 updates the float at CO1,
  * s12 the float at CO2.  CO1 then advances by 4 bytes.
  * Clobbers r3, s0 and s4.
  */
 .macro SAVE1x2
 	ldr	r3  , LDC
 	add	CO2 , CO1, r3
 	flds		s0, ALPHA
 	flds	s4 , [CO1]
 	fmacs	s4 , s0 , s8
 	fsts	s4 , [CO1]
 	flds	s4 , [CO2]
 	fmacs	s4 , s0 , s12
 	fsts	s4 , [CO2]
 	add	CO1, CO1, #4
 .endm
 /******************************************************************************/
 /*
  * Clear the four accumulators s8-s11 of a 4x1 tile.
  * The zero is moved in from a core register.  Subtracting s8 from itself
  * is not a safe way to clear it: the difference is NaN whenever s8 still
  * holds NaN or +/-Inf (from the caller or from the previous tile).
  * Clobbers r3, which this file only ever uses as a freshly written
  * temporary.  The condition flags are left untouched.
  */
 .macro INIT4x1
 	mov			r3 , #0
 	vmov			s8 , r3
 	vmov.f32		s9, s8
 	vmov.f32		s10, s8
 	vmov.f32		s11, s8
 .endm
 /*
  * One K step of the 4x1 tile: one float from BO (s4) and four from AO
  * (s0-s3), then s8-s11 += s0-s3 * s4.
  * AO advances by 16 bytes, BO by 4 bytes.
  */
 .macro KERNEL4x1_SUB
 	flds	s4 , [ BO ]
 	flds	s0 , [ AO ]
 	flds	s1 , [ AO, #4 ]
 	flds	s2 , [ AO, #8 ]
 	flds	s3 , [ AO, #12 ]
 	fmacs	s8  , s0,  s4
 	fmacs	s9  , s1,  s4
 	fmacs	s10 , s2,  s4
 	fmacs	s11 , s3,  s4
 	add	AO , AO, #16
 	add	BO , BO, #4
 .endm
 /*
  * Write back the 4x1 tile: the four floats at CO1 += alpha * s8-s11.
  * CO1 then advances by 16 bytes.  Clobbers s0 and s4-s7.
  */
 .macro SAVE4x1
 	flds		s0, ALPHA
 	flds	s4 , [CO1]
 	flds	s5 , [CO1, #4 ]
 	flds	s6 , [CO1, #8 ]
 	flds	s7 , [CO1, #12 ]
 	fmacs	s4 , s0 , s8
 	fmacs	s5 , s0 , s9
 	fmacs	s6 , s0 , s10
 	fmacs	s7 , s0 , s11
 	fsts	s4 , [CO1]
 	fsts	s5 , [CO1, #4 ]
 	fsts	s6 , [CO1, #8 ]
 	fsts	s7 , [CO1, #12 ]
 	add	CO1, CO1, #16
 .endm
 /******************************************************************************/
 /*
  * Clear the two accumulators s8 and s9 of a 2x1 tile.
  * The zero is moved in from a core register.  Subtracting s8 from itself
  * is not a safe way to clear it: the difference is NaN whenever s8 still
  * holds NaN or +/-Inf (from the caller or from the previous tile).
  * Clobbers r3, which this file only ever uses as a freshly written
  * temporary.  The condition flags are left untouched.
  */
 .macro INIT2x1
 	mov			r3 , #0
 	vmov			s8 , r3
 	vmov.f32		s9 , s8
 .endm
 /*
  * One K step of the 2x1 tile: one float from BO (s4) and two from AO
  * (s0, s1), then s8 += s0 * s4 and s9 += s1 * s4.
  * AO advances by 8 bytes, BO by 4 bytes.
  */
 .macro KERNEL2x1_SUB
 	flds	s4 , [ BO ]
 	flds	s0 , [ AO ]
 	flds	s1 , [ AO, #4 ]
 	fmacs	s8  , s0,  s4
 	fmacs	s9  , s1,  s4
 	add	AO , AO, #8
 	add	BO , BO, #4
 .endm
 /*
  * Write back the 2x1 tile: the two floats at CO1 += alpha * s8, s9.
  * CO1 then advances by 8 bytes.  Clobbers s0, s4 and s5.
  */
 .macro SAVE2x1
 	flds		s0, ALPHA
 	flds	s4 , [CO1]
 	flds	s5 , [CO1, #4 ]
 	fmacs	s4 , s0 , s8
 	fmacs	s5 , s0 , s9
 	fsts	s4 , [CO1]
 	fsts	s5 , [CO1, #4 ]
 	add	CO1, CO1, #8
 .endm
 /******************************************************************************/
 /*
  * Clear the single accumulator s8 of a 1x1 tile.
  * The zero is moved in from a core register.  Subtracting s8 from itself
  * is not a safe way to clear it: the difference is NaN whenever s8 still
  * holds NaN or +/-Inf (from the caller or from the previous tile).
  * Clobbers r3, which this file only ever uses as a freshly written
  * temporary.  The condition flags are left untouched.
  */
 .macro INIT1x1
 	mov			r3 , #0
 	vmov			s8 , r3
 .endm
 /*
  * One K step of the 1x1 tile: s8 += [AO] * [BO].
  * AO and BO each advance by 4 bytes.
  */
 .macro KERNEL1x1_SUB
 	flds	s4 , [ BO ]
 	flds	s0 , [ AO ]
 	fmacs	s8  , s0,  s4
 	add	AO , AO, #4
 	add	BO , BO, #4
 .endm
 /*
  * Write back the 1x1 tile: the float at CO1 += alpha * s8.
  * CO1 then advances by 4 bytes.  Clobbers s0 and s4.
  */
 .macro SAVE1x1
 	flds		s0, ALPHA
 	flds	s4 , [CO1]
 	fmacs	s4 , s0 , s8
 	fsts	s4 , [CO1]
 	add	CO1, CO1, #4
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	str	OLD_M, M
 	str	OLD_N, N
 	str	OLD_K, K
 	str	OLD_A, A
 	vstr	OLD_ALPHA, ALPHA
 	sub	r3, fp, #128
 	vstm	r3, { s8 - s15} 				// store floating point registers
 	ldr	r3, OLD_LDC
 	lsl	r3, r3, #2					// ldc = ldc * 4
 	str	r3, LDC
 	ldr	K1, K
 	ldr	BC, B
 	ldr	J, N
 	asrs	J, J, #1					// J = J / 2
 	ble	sgemm_kernel_L1_BEGIN
 /*********************************************************************************************/
 sgemm_kernel_L2_BEGIN:
 	ldr	CO1, C						// CO1 = C
 	ldr	r4 , LDC
 	lsl	r4 , r4 , #1					// LDC * 2
 	add	r3 , r4, CO1
 	str	r3 , C						// store C
 	ldr	AO, A						// AO = A
 sgemm_kernel_L2_M4_BEGIN:
 	ldr	I, M
 	asrs	I, I, #2					// I = I / 4
 	ble	sgemm_kernel_L2_M2_BEGIN
 sgemm_kernel_L2_M4_20:
 	INIT4x2
 	mov	BO, BC
 	asrs	L , K1, #3					// L = L / 8
 	ble	sgemm_kernel_L2_M4_40
 	.align 5
 sgemm_kernel_L2_M4_22:
 	pld [ AO, #A_PRE ]
 	pld [ BO, #B_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	pld [ AO, #A_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	pld [ AO, #A_PRE ]
 	pld [ BO, #B_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	pld [ AO, #A_PRE ]
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L2_M4_22
 sgemm_kernel_L2_M4_40:
 	ands	L , K1, #7					// L = L % 8
 	ble	sgemm_kernel_L2_M4_100
 sgemm_kernel_L2_M4_42:
 	KERNEL4x2_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L2_M4_42
 sgemm_kernel_L2_M4_100:
 	SAVE4x2
 sgemm_kernel_L2_M4_END:
 	subs	I, I, #1
 	bgt	sgemm_kernel_L2_M4_20
 sgemm_kernel_L2_M2_BEGIN:
 	ldr	I, M
 	tst	I , #3
 	ble	sgemm_kernel_L2_END
 	tst	I, #2					// I = I / 2
 	ble	sgemm_kernel_L2_M1_BEGIN
 sgemm_kernel_L2_M2_20:
 	INIT2x2
 	mov	BO, BC
 	asrs	L , K1, #3					// L = L / 8
 	ble	sgemm_kernel_L2_M2_40
 sgemm_kernel_L2_M2_22:
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L2_M2_22
 sgemm_kernel_L2_M2_40:
 	ands	L , K1, #7					// L = L % 8
 	ble	sgemm_kernel_L2_M2_100
 sgemm_kernel_L2_M2_42:
 	KERNEL2x2_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L2_M2_42
 sgemm_kernel_L2_M2_100:
 	SAVE2x2
 sgemm_kernel_L2_M2_END:
 sgemm_kernel_L2_M1_BEGIN:
 	tst	I, #1					// I = I % 2
 	ble	sgemm_kernel_L2_END
 sgemm_kernel_L2_M1_20:
 	INIT1x2
 	mov	BO, BC
 	asrs	L , K1, #3					// L = L / 8
 	ble	sgemm_kernel_L2_M1_40
 sgemm_kernel_L2_M1_22:
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L2_M1_22
 sgemm_kernel_L2_M1_40:
 	ands	L , K1, #7					// L = L % 8
 	ble	sgemm_kernel_L2_M1_100
 sgemm_kernel_L2_M1_42:
 	KERNEL1x2_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L2_M1_42
 sgemm_kernel_L2_M1_100:
 	SAVE1x2
 sgemm_kernel_L2_END:
 	mov	r3, BC
 	mov	r4, K1
 	lsl	r4, r4, #3					// k * 2 * 4
 	add	r3, r3, r4					// B = B + K * 2 * 4
 	mov	BC, r3
 	subs	J , #1						// j--
 	bgt	sgemm_kernel_L2_BEGIN
 /*********************************************************************************************/
 sgemm_kernel_L1_BEGIN:
 	ldr	J , N
 	tst	J , #1
 	ble	sgemm_kernel_L999
 	ldr	CO1, C						// CO1 = C
 	ldr	r4 , LDC
 	add	r3 , r4, CO1
 	str	r3 , C						// store C
 	ldr	AO, A						// AO = A
 sgemm_kernel_L1_M4_BEGIN:
 	ldr	I, M
 	asrs	I, I, #2					// I = I / 4
 	ble	sgemm_kernel_L1_M2_BEGIN
 sgemm_kernel_L1_M4_20:
 	INIT4x1
 	mov	BO, BC
 	asrs	L , K1, #3					// L = L / 8
 	ble	sgemm_kernel_L1_M4_40
 	.align 5
 sgemm_kernel_L1_M4_22:
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L1_M4_22
 sgemm_kernel_L1_M4_40:
 	ands	L , K1, #7					// L = L % 8
 	ble	sgemm_kernel_L1_M4_100
 sgemm_kernel_L1_M4_42:
 	KERNEL4x1_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L1_M4_42
 sgemm_kernel_L1_M4_100:
 	SAVE4x1
 sgemm_kernel_L1_M4_END:
 	subs	I, I, #1
 	bgt	sgemm_kernel_L1_M4_20
 sgemm_kernel_L1_M2_BEGIN:
 	ldr	I, M
 	tst	I , #3
 	ble	sgemm_kernel_L1_END
 	tst	I, #2					// I = I / 2
 	ble	sgemm_kernel_L1_M1_BEGIN
 sgemm_kernel_L1_M2_20:
 	INIT2x1
 	mov	BO, BC
 	asrs	L , K1, #3					// L = L / 8
 	ble	sgemm_kernel_L1_M2_40
 sgemm_kernel_L1_M2_22:
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L1_M2_22
 sgemm_kernel_L1_M2_40:
 	ands	L , K1, #7					// L = L % 8
 	ble	sgemm_kernel_L1_M2_100
 sgemm_kernel_L1_M2_42:
 	KERNEL2x1_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L1_M2_42
 sgemm_kernel_L1_M2_100:
 	SAVE2x1
 sgemm_kernel_L1_M2_END:
 sgemm_kernel_L1_M1_BEGIN:
 	tst	I, #1					// I = I % 2
 	ble	sgemm_kernel_L1_END
 sgemm_kernel_L1_M1_20:
 	INIT1x1
 	mov	BO, BC
 	asrs	L , K1, #3					// L = L / 8
 	ble	sgemm_kernel_L1_M1_40
 sgemm_kernel_L1_M1_22:
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L1_M1_22
 sgemm_kernel_L1_M1_40:
 	ands	L , K1, #7					// L = L % 8
 	ble	sgemm_kernel_L1_M1_100
 sgemm_kernel_L1_M1_42:
 	KERNEL1x1_SUB
 	subs	L, L, #1
 	bgt	sgemm_kernel_L1_M1_42
 sgemm_kernel_L1_M1_100:
 	SAVE1x1
 sgemm_kernel_L1_END:
 sgemm_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S
--- a/kernel/arm/sgemm_ncopy_2_vfp.S
+++ b/kernel/arm/sgemm_ncopy_2_vfp.S
@ -0,0 +1,225 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/24 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Not referenced anywhere in this file: the routine never lowers sp below the pushed registers. */
 #define STACKSIZE 256
 /* Names for the four register arguments as they arrive in r0-r3. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /* First stack argument. The prologue pushes seven words and sets fp = sp + 24,
  * so fp addresses the last pushed word and fp + 4 is the first word above it. */
 #define B	[fp, #4 ]
 /* M, N and A are used in place, in the registers they arrived in. */
 #define M	r0
 #define N	r1
 #define A	r2
 /* BO: write pointer into the packed buffer (loaded from B, post-incremented by the COPY macros). */
 #define	BO	r5
 /* AO1 / AO2: read pointers into two source vectors that lie LDA bytes apart. */
 #define	AO1	r6
 #define	AO2	r7
 /* LDA: OLD_LDA shifted left by 2, i.e. the stride in bytes. */
 #define	LDA	r8
 /* I: inner loop counter. It reuses r3, which is free once LDA has been derived from OLD_LDA. */
 #define I	r3
 /* J: outer loop counter. */
 #define	J	r12
 /* Not referenced in this file (no pld instruction is issued here). */
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 // COPY2x2: read two consecutive floats from AO1 and two from AO2 and store them
 // interleaved as AO1[0], AO2[0], AO1[1], AO2[1]. BO advances by 16 bytes
 // (writeback), AO1 and AO2 advance by 8 bytes each.
 .macro COPY2x2
 	flds	s0 , [ AO1, #0  ]
 	flds	s2 , [ AO1, #4  ]
 	flds	s1 , [ AO2, #0  ]
 	flds	s3 , [ AO2, #4  ]
 	add	AO1, AO1, #8
 	fstmias	BO!, { s0 - s3 }
 	add	AO2, AO2, #8
 .endm
 // COPY1x2: read one float from AO1 and one from AO2 and store them as the pair
 // AO1[0], AO2[0]. BO advances by 8 bytes, AO1 and AO2 by 4 bytes each.
 .macro COPY1x2
 	flds	s0 , [ AO1, #0  ]
 	flds	s1 , [ AO2, #0  ]
 	add	AO1, AO1, #4
 	fstmias	BO!, { s0 - s1 }
 	add	AO2, AO2, #4
 .endm
 // COPY2x1: copy two consecutive floats from AO1 to BO unchanged.
 // BO and AO1 both advance by 8 bytes.
 .macro COPY2x1
 	flds	s0 , [ AO1, #0  ]
 	flds	s1 , [ AO1, #4  ]
 	fstmias	BO!, { s0 - s1 }
 	add	AO1, AO1, #8
 .endm
 // COPY1x1: copy a single float from AO1 to BO. Both advance by 4 bytes.
 .macro COPY1x1
 	flds	s0 , [ AO1, #0  ]
 	fstmias	BO!, { s0 }
 	add	AO1, AO1, #4
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	lsl	LDA, OLD_LDA, #2					// lda = lda * 4
 	ldr	BO, B	
 /*********************************************************************************************/
 sgemm_ncopy_L2_BEGIN:
 	asrs	J, N, #1					// J = N / 2
 	ble	sgemm_ncopy_L1_BEGIN
 sgemm_ncopy_L2_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	add	AO2, AO1, LDA
 	add	A  , AO2, LDA 					// A = A + 2 * LDA
 	asrs	I, M, #1					// I = M / 2
 	ble	sgemm_ncopy_L2_M2_40
 sgemm_ncopy_L2_M2_20:
 	COPY2x2
 	subs	I , I , #1
 	bne	sgemm_ncopy_L2_M2_20
 sgemm_ncopy_L2_M2_40:
 	ands	I, M , #1
 	ble	sgemm_ncopy_L2_M2_END
 sgemm_ncopy_L2_M2_60:
 	COPY1x2
 	subs	I , I , #1
 	bne	sgemm_ncopy_L2_M2_60
 sgemm_ncopy_L2_M2_END:
 	subs	J , J, #1						// j--
 	bne	sgemm_ncopy_L2_M2_BEGIN
 /*********************************************************************************************/
 sgemm_ncopy_L1_BEGIN:
 	tst	N, #1
 	ble	sgemm_ncopy_L999
 sgemm_ncopy_L1_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	add	A  , AO1, LDA 					// A = A + 1 * LDA
 	asrs	I, M, #1					// I = M / 2
 	ble	sgemm_ncopy_L1_M2_40
 sgemm_ncopy_L1_M2_20:
 	COPY2x1
 	subs	I , I , #1
 	bne	sgemm_ncopy_L1_M2_20
 sgemm_ncopy_L1_M2_40:
 	ands	I, M , #1
 	ble	sgemm_ncopy_L1_M2_END
 sgemm_ncopy_L1_M2_60:
 	COPY1x1
 	subs	I , I , #1
 	bne	sgemm_ncopy_L1_M2_60
 sgemm_ncopy_L1_M2_END:
 sgemm_ncopy_L999:
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/sgemm_ncopy_4_vfp.S
+++ b/kernel/arm/sgemm_ncopy_4_vfp.S
@ -0,0 +1,353 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/05 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of scratch stack reserved below the pushed registers by the prologue. */
 #define STACKSIZE 256
 /* Names for the four register arguments as they arrive in r0-r3. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 * (the code below stores s8 - s15, i.e. 32 bytes
 * starting at fp - 128)
 *******************************************************/
 /* Stack slot holding the byte stride (lda * 4). It lies inside the STACKSIZE
  * area: after the prologue sp = fp - 24 - STACKSIZE = fp - 280. */
 #define LDA	[fp, #-260 ]
 /* First stack argument. The prologue pushes seven words and sets fp = sp + 24,
  * so fp + 4 is the first word above the pushed registers. */
 #define B	[fp, #4 ]
 /* M, N and A are used in place, in the registers they arrived in. */
 #define M	r0
 #define N	r1
 #define A	r2
 /* BO: write pointer into the packed buffer (post-incremented by the COPY macros). */
 #define	BO	r5
 /* AO1 .. AO4: read pointers into up to four source vectors. */
 #define	AO1	r6
 #define	AO2	r7
 #define	AO3	r8
 #define	AO4	r9
 /* I: inner loop counter. It reuses r3 after the scaled lda has been spilled to the LDA slot. */
 #define I	r3
 /* J: outer loop counter. */
 #define	J	r12
 /* Prefetch distance in bytes used by the pld instructions in the 4x4 loop. */
 #define A_PRE	192
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 // COPY4x4: read four consecutive floats from each of AO1..AO4 and store them
 // interleaved, 16 floats in total: AO1[i], AO2[i], AO3[i], AO4[i] for
 // i = 0..3. Register s(4*i + k) holds element i of vector k+1, so the three
 // stores emit s0..s15 in order. BO advances by 64 bytes, each AOx by 16 bytes.
 // Uses s8 - s15, which the routine saves and restores around its body.
 .macro COPY4x4
 	flds s0 , [ AO1, #0  ]
 	flds s1 , [ AO2, #0  ]
 	flds s2 , [ AO3, #0  ]
 	flds s3 , [ AO4, #0  ]
 	flds s4 , [ AO1, #4  ]
 	flds s8 , [ AO1, #8 ]
 	flds s12, [ AO1, #12 ]
 	flds s5 , [ AO2, #4  ]
 	add	AO1, AO1, #16
 	flds s9 , [ AO2, #8 ]
 	flds s13, [ AO2, #12 ]
 	flds s6 , [ AO3, #4  ]
 	add	AO2, AO2, #16
 	flds s10, [ AO3, #8 ]
 	flds s14, [ AO3, #12 ]
 	flds s7 , [ AO4, #4  ]
 	add	AO3, AO3, #16
 	flds s11, [ AO4, #8 ]
 	flds s15, [ AO4, #12 ]
 	fstmias	BO!, { s0 - s3 }
 	add	AO4, AO4, #16
 	fstmias	BO!, { s4 - s7 }
 	fstmias	BO!, { s8 - s15 }
 .endm
 // COPY1x4: read one float from each of AO1..AO4 and store them as
 // AO1[0], AO2[0], AO3[0], AO4[0]. BO advances by 16 bytes, each AOx by 4 bytes.
 .macro COPY1x4
 	flds s0 , [ AO1, #0  ]
 	flds s1 , [ AO2, #0  ]
 	add	AO1, AO1, #4
 	flds s2 , [ AO3, #0  ]
 	add	AO2, AO2, #4
 	flds s3 , [ AO4, #0  ]
 	add	AO3, AO3, #4
 	fstmias	BO!, { s0 - s3 }
 	add	AO4, AO4, #4
 .endm
 // COPY4x2: read four consecutive floats from AO1 and from AO2 and store them
 // interleaved as AO1[i], AO2[i] for i = 0..3 (even s registers come from AO1,
 // odd ones from AO2). BO advances by 32 bytes, AO1 and AO2 by 16 bytes each.
 .macro COPY4x2
 	flds s0 , [ AO1, #0  ]
 	flds s2 , [ AO1, #4  ]
 	flds s4 , [ AO1, #8 ]
 	flds s6 , [ AO1, #12 ]
 	flds s1 , [ AO2, #0  ]
 	flds s3 , [ AO2, #4  ]
 	add	AO1, AO1, #16
 	flds s5 , [ AO2, #8 ]
 	flds s7 , [ AO2, #12 ]
 	fstmias	BO!, { s0 - s7 }
 	add	AO2, AO2, #16
 .endm
 // COPY1x2: read one float from AO1 and one from AO2 and store the pair.
 // BO advances by 8 bytes, AO1 and AO2 by 4 bytes each.
 .macro COPY1x2
 	flds s0 , [ AO1, #0  ]
 	flds s1 , [ AO2, #0  ]
 	add	AO1, AO1, #4
 	fstmias	BO!, { s0 - s1 }
 	add	AO2, AO2, #4
 .endm
 // COPY4x1: copy four consecutive floats from AO1 to BO unchanged.
 // BO and AO1 both advance by 16 bytes.
 .macro COPY4x1
 	flds s0 , [ AO1, #0  ]
 	flds s1 , [ AO1, #4  ]
 	flds s2 , [ AO1, #8 ]
 	flds s3 , [ AO1, #12 ]
 	fstmias	BO!, { s0 - s3 }
 	add	AO1, AO1, #16
 .endm
 // COPY1x1: copy a single float from AO1 to BO. Both advance by 4 bytes.
 .macro COPY1x1
 	flds s0 , [ AO1, #0  ]
 	fstmias	BO!, { s0 }
 	add	AO1, AO1, #4
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	lsl	r3, r3, #2					// lda = lda * 4
 	str	r3, LDA
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15} 				// store floating point registers
 	ldr	BO, B	
 sgemm_ncopy_L4_BEGIN:
 	asrs	J, N, #2					// J = N / 4
 	ble	sgemm_ncopy_L2_BEGIN
 sgemm_ncopy_L4_M4_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	AO2, AO1, r4
 	add	AO3, AO2, r4
 	add	AO4, AO3, r4
 	add	A  , AO4, r4					// A = A + 4 * LDA
 	asrs	I, M, #2					// I = M / 4
 	ble	sgemm_ncopy_L4_M4_40
 sgemm_ncopy_L4_M4_20:
 	pld	[ AO1, #A_PRE ]
 	pld	[ AO2, #A_PRE ]
 	pld	[ AO3, #A_PRE ]
 	pld	[ AO4, #A_PRE ]
 	COPY4x4
 	subs	I , I , #1
 	ble	sgemm_ncopy_L4_M4_40
 	COPY4x4
 	subs	I , I , #1
 	bne	sgemm_ncopy_L4_M4_20
 sgemm_ncopy_L4_M4_40:
 	ands	I, M , #3
 	ble	sgemm_ncopy_L4_M4_END
 sgemm_ncopy_L4_M4_60:
 	COPY1x4
 	subs	I , I , #1
 	bne	sgemm_ncopy_L4_M4_60
 sgemm_ncopy_L4_M4_END:
 	subs	J , J, #1						// j--
 	bne	sgemm_ncopy_L4_M4_BEGIN
 /*********************************************************************************************/
 sgemm_ncopy_L2_BEGIN:
 	tst	N, #3
 	ble	sgemm_ncopy_L999
 	tst	N, #2
 	ble	sgemm_ncopy_L1_BEGIN
 sgemm_ncopy_L2_M4_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	AO2, AO1, r4
 	add	A  , AO2, r4 					// A = A + 2 * LDA
 	asrs	I, M, #2					// I = M / 4
 	ble	sgemm_ncopy_L2_M4_40
 sgemm_ncopy_L2_M4_20:
 	COPY4x2
 	subs	I , I , #1
 	bne	sgemm_ncopy_L2_M4_20
 sgemm_ncopy_L2_M4_40:
 	ands	I, M , #3
 	ble	sgemm_ncopy_L2_M4_END
 sgemm_ncopy_L2_M4_60:
 	COPY1x2
 	subs	I , I , #1
 	bne	sgemm_ncopy_L2_M4_60
 sgemm_ncopy_L2_M4_END:
 /*********************************************************************************************/
 sgemm_ncopy_L1_BEGIN:
 	tst	N, #1
 	ble	sgemm_ncopy_L999
 sgemm_ncopy_L1_M4_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	A  , AO1, r4 					// A = A + 1 * LDA
 	asrs	I, M, #2					// I = M / 4
 	ble	sgemm_ncopy_L1_M4_40
 sgemm_ncopy_L1_M4_20:
 	COPY4x1
 	subs	I , I , #1
 	bne	sgemm_ncopy_L1_M4_20
 sgemm_ncopy_L1_M4_40:
 	ands	I, M , #3
 	ble	sgemm_ncopy_L1_M4_END
 sgemm_ncopy_L1_M4_60:
 	COPY1x1
 	subs	I , I , #1
 	bne	sgemm_ncopy_L1_M4_60
 sgemm_ncopy_L1_M4_END:
 sgemm_ncopy_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/sgemm_tcopy_4_vfp.S
+++ b/kernel/arm/sgemm_tcopy_4_vfp.S
@ -0,0 +1,430 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/06 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of scratch stack reserved below the pushed registers by the prologue. */
 #define STACKSIZE 256
 /* Names for the four register arguments as they arrive in r0-r3. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 * (the code below stores s8 - s15, i.e. 32 bytes
 * starting at fp - 128)
 *******************************************************/
 /* First stack argument (fp + 4 is the first word above the seven pushed
  * registers). The routine also writes this slot: it keeps the running
  * destination pointer for full 4-wide blocks there. */
 #define B	[fp, #4 ]
 /* Stack slot holding the running source pointer. r2 is freed for M4 once
  * OLD_A has been stored here. */
 #define A	[fp, #-248 ]
 /* M and N are used in place, in the registers they arrived in. */
 #define M	r0
 #define N	r1
 /* M4 = M * 16: byte distance added to BO1 after every 4-wide block. */
 #define M4	r2
 /* LDA: OLD_LDA shifted left by 2, i.e. the stride in bytes. */
 #define	LDA	r5
 /* AO1: read pointer. The following source lines are reached through r3 = AO1 + k * LDA. */
 #define	AO1	r6
 /* BO1: write pointer for 4-wide blocks. */
 #define	BO1	r7
 /* BO2: write pointer for the 2-wide remainder (selected by N & 2). */
 #define	BO2	r8
 /* BO3: write pointer for the 1-wide remainder (selected by N & 1). */
 #define	BO3	r9
 /* I: inner loop counter. */
 #define I	r4
 /* J: outer loop counter. */
 #define	J	r12
 /* Prefetch distance in bytes used by the pld instructions in COPY4x4_1. */
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 // Naming: COPY<w>x<l> reads <w> consecutive floats from each of <l> source
 // lines (AO1, AO1 + LDA, ...) and stores the <w> * <l> values contiguously,
 // line after line. r3 is the scratch pointer that walks the lines. None of the
 // stores use writeback; the destination pointer is advanced explicitly.

 // COPY4x4_1: 4 floats from 4 lines -> 16 floats at BO1, with a prefetch of
 // every line. AO1 += 16 bytes, BO1 += M4.
 .macro COPY4x4_1
 	pld	[ AO1, #A_PRE  ]
 	fldmias	AO1, { s0 - s3 }
 	add	r3, AO1, LDA
 	pld	[ r3, #A_PRE  ]
 	fldmias	r3, { s4 - s7 }
 	add	r3, r3, LDA
 	pld	[ r3, #A_PRE  ]
 	fldmias	r3, { s8 - s11 }
 	add	r3, r3, LDA
 	pld	[ r3, #A_PRE  ]
 	fldmias	r3, { s12 - s15 }
 	fstmias	BO1, { s0 - s15 }
 	add	AO1, AO1, #16
 	add	BO1, BO1, M4
 .endm
 // COPY4x4_2: same data movement as COPY4x4_1 but without the prefetches.
 .macro COPY4x4_2
 	fldmias	AO1, { s0 - s3 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s4 - s7 }
 	add	r3, r3, LDA
 	fldmias	r3, { s8 - s11 }
 	add	r3, r3, LDA
 	fldmias	r3, { s12 - s15 }
 	fstmias	BO1, { s0 - s15 }
 	add	AO1, AO1, #16
 	add	BO1, BO1, M4
 .endm
 // COPY2x4: 2 floats from 4 lines -> 8 floats at BO2. AO1 += 8 bytes, BO2 += 32 bytes.
 .macro COPY2x4
 	fldmias	AO1, { s0 - s1 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s2 - s3 }
 	add	r3, r3, LDA
 	fldmias	r3, { s4 - s5 }
 	add	r3, r3, LDA
 	fldmias	r3, { s6 - s7 }
 	fstmias	BO2, { s0 - s7 }
 	add	AO1, AO1, #8
 	add	BO2, BO2, #32
 .endm
 // COPY1x4: 1 float from 4 lines -> 4 floats at BO3. AO1 += 4 bytes, BO3 += 16 bytes.
 .macro COPY1x4
 	fldmias	AO1, { s0 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s1 }
 	add	r3, r3, LDA
 	fldmias	r3, { s2 }
 	add	r3, r3, LDA
 	fldmias	r3, { s3 }
 	fstmias	BO3, { s0 - s3 }
 	add	AO1, AO1, #4
 	add	BO3, BO3, #16
 .endm
 /*************************************************************************************************************************/
 // COPY4x2: 4 floats from 2 lines -> 8 floats at BO1. AO1 += 16 bytes, BO1 += M4.
 .macro COPY4x2
 	fldmias	AO1, { s0 - s3 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s4 - s7 }
 	fstmias	BO1, { s0 - s7 }
 	add	AO1, AO1, #16
 	add	BO1, BO1, M4
 .endm
 // COPY2x2: 2 floats from 2 lines -> 4 floats at BO2. AO1 += 8 bytes, BO2 += 16 bytes.
 .macro COPY2x2
 	fldmias	AO1, { s0 - s1 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s2 - s3 }
 	fstmias	BO2, { s0 - s3 }
 	add	AO1, AO1, #8
 	add	BO2, BO2, #16
 .endm
 // COPY1x2: 1 float from 2 lines -> 2 floats at BO3. AO1 += 4 bytes, BO3 += 8 bytes.
 .macro COPY1x2
 	fldmias	AO1, { s0 }
 	add	r3, AO1, LDA
 	fldmias	r3, { s1 }
 	fstmias	BO3, { s0 - s1 }
 	add	AO1, AO1, #4
 	add	BO3, BO3, #8
 .endm
 /*************************************************************************************************************************/
 // COPY4x1: 4 floats from 1 line -> 4 floats at BO1. AO1 += 16 bytes, BO1 += M4.
 .macro COPY4x1
 	fldmias	AO1, { s0 - s3 }
 	fstmias	BO1, { s0 - s3 }
 	add	AO1, AO1, #16
 	add	BO1, BO1, M4
 .endm
 // COPY2x1: 2 floats from 1 line -> 2 floats at BO2. AO1 += 8 bytes, BO2 += 8 bytes.
 .macro COPY2x1
 	fldmias	AO1, { s0 - s1 }
 	fstmias	BO2, { s0 - s1 }
 	add	AO1, AO1, #8
 	add	BO2, BO2, #8
 .endm
 // COPY1x1: 1 float from 1 line -> 1 float at BO3. AO1 += 4 bytes, BO3 += 4 bytes.
 .macro COPY1x1
 	fldmias	AO1, { s0 }
 	fstmias	BO3, { s0 }
 	add	AO1, AO1, #4
 	add	BO3, BO3, #4
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	str	OLD_A, A					// store A
 	lsl	LDA, OLD_LDA, #2				// lda = lda * SIZE
 	sub	r4, fp, #128
 	vstm	r4, { s8 - s15} 				// store floating point registers
 	lsl	r4 , M, #2					// M * SIZE
 	ldr	r3, B
 	and	BO2 , N , #-4
 	and	BO3 , N , #-2
 	mul	BO2, BO2, r4
 	mul	BO3, BO3, r4
 	add	BO2 , BO2, r3
 	add	BO3 , BO3, r3
 	lsl	M4, M, #4					// M4 = M * 4 * SIZE
 sgemm_tcopy_L4_BEGIN:
 	asrs	J, M, #2					// J = N / 4
 	ble	sgemm_tcopy_L2_BEGIN
 sgemm_tcopy_L4_M4_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	lsl	r3, LDA, #2					// r3 = 4 * LDA
 	add	r3, r3 , AO1					// A = A + 4 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #64					// B = B + 16 * SIZE
 	str	r3, B
 	asrs	I, N, #2					// I = M / 4
 	ble	sgemm_tcopy_L4_M4_40
 sgemm_tcopy_L4_M4_20:
 	COPY4x4_1
 	subs	I , I , #1
 	ble	sgemm_tcopy_L4_M4_40
 	COPY4x4_2
 	subs	I , I , #1
 	bne	sgemm_tcopy_L4_M4_20
 sgemm_tcopy_L4_M4_40:
 	tst	N , #2
 	ble	sgemm_tcopy_L4_M4_60
 	COPY2x4
 sgemm_tcopy_L4_M4_60:
 	tst	N, #1
 	ble	sgemm_tcopy_L4_M4_END
 	COPY1x4
 sgemm_tcopy_L4_M4_END:
 	subs	J , J, #1						// j--
 	bne	sgemm_tcopy_L4_M4_BEGIN
 /*********************************************************************************************/
 sgemm_tcopy_L2_BEGIN:
 	tst	M, #3
 	ble	sgemm_tcopy_L999
 	tst	M, #2
 	ble	sgemm_tcopy_L1_BEGIN
 sgemm_tcopy_L2_M4_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	lsl	r3, LDA, #1					// r3 = 2 * LDA
 	add	r3, r3 , AO1					// A = A + 2 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #32					// B = B + 8 * SIZE
 	str	r3, B
 	asrs	I, N, #2					// I = M / 4
 	ble	sgemm_tcopy_L2_M4_40
 sgemm_tcopy_L2_M4_20:
 	COPY4x2
 	subs	I , I , #1
 	bne	sgemm_tcopy_L2_M4_20
 sgemm_tcopy_L2_M4_40:
 	tst	N , #2
 	ble	sgemm_tcopy_L2_M4_60
 	COPY2x2
 sgemm_tcopy_L2_M4_60:
 	tst	N , #1
 	ble	sgemm_tcopy_L2_M4_END
 	COPY1x2
 sgemm_tcopy_L2_M4_END:
 /*********************************************************************************************/
 sgemm_tcopy_L1_BEGIN:
 	tst	M, #1
 	ble	sgemm_tcopy_L999
 sgemm_tcopy_L1_M4_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	add	r3, LDA , AO1					// A = A + 1 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #16					// B = B + 4 * SIZE
 	str	r3, B
 	asrs	I, N, #2					// I = M / 4
 	ble	sgemm_tcopy_L1_M4_40
 sgemm_tcopy_L1_M4_20:
 	COPY4x1
 	subs	I , I , #1
 	bne	sgemm_tcopy_L1_M4_20
 sgemm_tcopy_L1_M4_40:
 	tst	N , #2
 	ble	sgemm_tcopy_L1_M4_60
 	COPY2x1
 sgemm_tcopy_L1_M4_60:
 	tst	N , #1
 	ble	sgemm_tcopy_L1_M4_END
 	COPY1x1
 sgemm_tcopy_L1_M4_END:
 sgemm_tcopy_L999:
 	sub	r3, fp, #128
 	vldm	r3, { s8 - s15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/strmm_kernel_4x2_vfp.S
+++ b/kernel/arm/strmm_kernel_4x2_vfp.S
--- a/kernel/arm/strmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S
--- a/kernel/arm/swap.c
+++ b/kernel/arm/swap.c
@ -0,0 +1,62 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/08/20 Saar
 *	 BLASTEST float		OK
 * 	 BLASTEST double	OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <stdio.h>
 /*
  * Exchange n elements between x and y.
  *
  * Element k of x lives at x[k * inc_x] and element k of y at y[k * inc_y].
  * The dummy parameters are not read. Always returns 0; a negative n is a no-op.
  */
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG count;
 	BLASLONG pos_x = 0;
 	BLASLONG pos_y = 0;
 	FLOAT saved;

 	if (n < 0)
 		return(0);

 	for (count = 0; count < n; count++)
 	{
 		/* classic three-step exchange through a temporary */
 		saved    = x[pos_x];
 		x[pos_x] = y[pos_y];
 		y[pos_y] = saved;

 		pos_x += inc_x;
 		pos_y += inc_y;
 	}

 	return(0);
 }
--- a/kernel/arm/swap_vfp.S
+++ b/kernel/arm/swap_vfp.S
@ -0,0 +1,354 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/14 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Not referenced in this file: the routine reserves no scratch stack. */
 #define STACKSIZE 256
 /* Stack arguments. The prologue pushes two words and sets fp = sp + 8, so fp
  * equals the stack pointer at entry and [fp, #0] is the first stack word. */
 #define	OLD_INC_X	[fp, #0 ]
 #define	OLD_Y		[fp, #4 ]
 #define	OLD_INC_Y	[fp, #8 ]
 /* N stays in r0. */
 #define	N	r0
 /* Y, INC_X and INC_Y are loaded from the stack into r1, r2 and r4. */
 #define Y	r1
 #define	INC_X	r2
 /* X is used directly from r3 and is never loaded by this file.
  * NOTE(review): presumably the floating point dummy argument(s) travel in VFP
  * registers (hard-float ABI), which leaves r3 for x - confirm against the caller. */
 #define	X	r3
 #define INC_Y	r4
 /* I: loop counter. */
 #define I	r12
 /* Prefetch distance in bytes used by the pld instructions. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*****************************************************************************************/
 /*
  * Three macros are defined for each of the four type variants:
  *   KERNEL_F4 - contiguous path: swap four elements, X and Y post-incremented
  *   KERNEL_F1 - contiguous path: swap one element, X and Y post-incremented
  *   KERNEL_S1 - strided path: swap one element, then add INC_X / INC_Y
  *               (already scaled to bytes by the caller) to X / Y
  * In all of them the X data goes through register 0.. and the Y data through
  * register 4.., and each side is stored back to the other pointer.
  */
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 /* real double: one element = one d register (8 bytes) */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	fldmiad	X,  { d0 - d3 }
 	fldmiad	Y,  { d4 - d7 }
 	fstmiad	Y!, { d0 - d3 }
 	fstmiad	X!, { d4 - d7}
 .endm
 .macro KERNEL_F1
 	fldmiad	X,  { d0 }
 	fldmiad	Y,  { d4 }
 	fstmiad	Y!, { d0 }
 	fstmiad	X!, { d4 }
 .endm
 .macro KERNEL_S1
 	fldmiad	X, { d0 }
 	fldmiad	Y, { d4 }
 	fstmiad	Y, { d0 }
 	fstmiad	X, { d4 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #else
 /* real single: one element = one s register (4 bytes).
  * KERNEL_F4 contains no pld in this variant; the contiguous loop in the
  * routine body issues the prefetch itself for this case. */
 .macro KERNEL_F4
 	fldmias	X,  { s0 - s3 }
 	fldmias	Y,  { s4 - s7 }
 	fstmias	Y!, { s0 - s3 }
 	fstmias	X!, { s4 - s7}
 .endm
 .macro KERNEL_F1
 	fldmias	X,  { s0 }
 	fldmias	Y,  { s4 }
 	fstmias	Y!, { s0 }
 	fstmias	X!, { s4 }
 .endm
 .macro KERNEL_S1
 	fldmias	X, { s0 }
 	fldmias	Y, { s4 }
 	fstmias	Y, { s0 }
 	fstmias	X, { s4 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 /* complex double: one element = two d registers (16 bytes), so four elements
  * take two rounds of four d registers each */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	fldmiad	X,  { d0 - d3 }
 	fldmiad	Y,  { d4 - d7 }
 	fstmiad	Y!, { d0 - d3 }
 	fstmiad	X!, { d4 - d7}
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	fldmiad	X,  { d0 - d3 }
 	fldmiad	Y,  { d4 - d7 }
 	fstmiad	Y!, { d0 - d3 }
 	fstmiad	X!, { d4 - d7}
 .endm
 .macro KERNEL_F1
 	fldmiad	X,  { d0 - d1 }
 	fldmiad	Y,  { d4 - d5 }
 	fstmiad	Y!, { d0 - d1 }
 	fstmiad	X!, { d4 - d5 }
 .endm
 .macro KERNEL_S1
 	fldmiad	X,  { d0 - d1 }
 	fldmiad	Y,  { d4 - d5 }
 	fstmiad	Y,  { d0 - d1 }
 	fstmiad	X,  { d4 - d5 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #else
 /* complex single: one element = two s registers (8 bytes), so four elements
  * take two rounds of four s registers each; only the first round prefetches */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 	fldmias	X,  { s0 - s3 }
 	fldmias	Y,  { s4 - s7 }
 	fstmias	Y!, { s0 - s3 }
 	fstmias	X!, { s4 - s7}
 	fldmias	X,  { s0 - s3 }
 	fldmias	Y,  { s4 - s7 }
 	fstmias	Y!, { s0 - s3 }
 	fstmias	X!, { s4 - s7}
 .endm
 .macro KERNEL_F1
 	fldmias	X,  { s0 - s1 }
 	fldmias	Y,  { s4 - s5 }
 	fstmias	Y!, { s0 - s1 }
 	fstmias	X!, { s4 - s5 }
 .endm
 .macro KERNEL_S1
 	fldmias	X,  { s0 - s1 }
 	fldmias	Y,  { s4 - s5 }
 	fstmias	Y,  { s0 - s1 }
 	fstmias	X,  { s4 - s5 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Swap N elements between X and Y.
  *
  * Returns immediately (r0 = 0) when N <= 0 or when either increment is 0.
  * When both increments are 1 the contiguous path (swap_kernel_F*) is taken,
  * otherwise the strided path (swap_kernel_S*), which first scales the
  * increments from elements to bytes for the active type variant.
  * Every BLE below follows flags that were fully set by a preceding CMP or SUBS
  * (ASRS / ANDS in between leave V untouched).
  */
 	PROLOGUE
 	.align 5
 	push    {r4 , fp}
        add     fp, sp, #8
 	ldr    INC_X , OLD_INC_X
 	ldr         Y, OLD_Y
 	ldr    INC_Y , OLD_INC_Y
 	cmp	N, #0
 	ble	swap_kernel_L999
 	cmp	INC_X, #0
 	beq	swap_kernel_L999
 	cmp	INC_Y, #0
 	beq	swap_kernel_L999
 	cmp	INC_X, #1
 	bne	swap_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	swap_kernel_S_BEGIN
 // ---- contiguous path: both increments are 1 ----
 swap_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	swap_kernel_F1
 	.align 5
 // Loop body is unrolled twice; the BLE in the middle leaves after an odd
 // number of four-element blocks.
 swap_kernel_F4:
 #if !defined(COMPLEX) && !defined(DOUBLE)
 	// the real single KERNEL_F4 has no prefetch of its own
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
 #endif
 	KERNEL_F4
 	subs	I, I, #1
 	ble	swap_kernel_F1
 	KERNEL_F4
 	subs	I, I, #1
 	bne	swap_kernel_F4
 swap_kernel_F1:
 	ands	I, N, #3					// I = N % 4
 	ble	swap_kernel_L999
 swap_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     swap_kernel_F10
 	b	swap_kernel_L999
 // ---- strided path ----
 swap_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
 #endif
 #endif
 	asrs	I, N, #2					// I = N / 4
 	ble	swap_kernel_S1
 	.align 5
 swap_kernel_S4:
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	subs	I, I, #1
 	bne	swap_kernel_S4
 swap_kernel_S1:
 	ands	I, N, #3					// I = N % 4
 	ble	swap_kernel_L999
 swap_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     swap_kernel_S10
 swap_kernel_L999:
 	mov	r0, #0		// set return value
 	sub     sp, fp, #8	// sp back to the two pushed words
 	pop     {r4,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/zamax.c
+++ b/kernel/arm/zamax.c
@ -0,0 +1,81 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 /* ABS: absolute value function matching the precision selected by DOUBLE. */
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * CABS1(x,i): |x[i]| + |x[i+1]|, i.e. |re| + |im| of the complex value stored
  * at x[i], x[i+1]. The expansion and its arguments are fully parenthesised so
  * the macro keeps its meaning inside a larger expression (for example
  * 2 * CABS1(x,i)) and with non-trivial argument expressions.
  */
 #define CABS1(x,i)	(ABS((x)[(i)])+ABS((x)[(i)+1]))
 /*
  * Return the largest value of |re| + |im| over the n complex elements of x.
  *
  * x      interleaved (re, im) pairs; element k starts at x[2 * k * inc_x]
  * n      number of complex elements
  * inc_x  stride between elements, counted in complex elements
  *
  * Returns 0.0 when n <= 0 or inc_x < 1. The n == 0 case has to return early
  * as well: the running maximum is seeded from the first element, which does
  * not exist for an empty vector.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	BLASLONG inc_x2;
 	FLOAT maxf;
 	FLOAT cur;

 	if (n <= 0 || inc_x < 1 ) return(0.0);

 	inc_x2 = 2 * inc_x;

 	/* seed with element 0, then scan the remaining n - 1 elements */
 	maxf = CABS1(x,0);
 	ix = inc_x2;
 	for (i = 1; i < n; i++)
 	{
 		cur = CABS1(x,ix);
 		if( cur > maxf )
 		{
 			maxf = cur;
 		}
 		ix += inc_x2;
 	}
 	return(maxf);
 }
--- a/kernel/arm/zamin.c
+++ b/kernel/arm/zamin.c
@ -0,0 +1,81 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: NoTest
 * 	 TEST			: NoTest
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /* |Re| + |Im| of the complex value stored at x[i], x[i+1].  The expansion is
  * fully parenthesized so it stays correct inside larger expressions. */
 #define CABS1(x,i)	(ABS((x)[(i)]) + ABS((x)[(i)+1]))
 /*
  * Returns the smallest |Re| + |Im| found among the n complex elements of x.
  *
  * x holds interleaved (real, imaginary) pairs; consecutive elements are
  * 2 * inc_x FLOATs apart.  Returns 0.0 when n <= 0 or inc_x < 1.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i;
 	BLASLONG ix;
 	BLASLONG inc_x2;
 	FLOAT minf;
 	FLOAT cur;
 	/* n == 0 has to be rejected too: the seed below reads x[0] and x[1]. */
 	if (n <= 0 || inc_x < 1 ) return(0.0);
 	inc_x2 = 2 * inc_x;
 	/* Seed with element 0, then scan the remaining n - 1 elements. */
 	minf = CABS1(x,0);
 	ix = inc_x2;
 	for (i = 1; i < n; i++)
 	{
 		cur = CABS1(x,ix);
 		if( cur < minf ) minf = cur;
 		ix += inc_x2;
 	}
 	return(minf);
 }
--- a/kernel/arm/zasum.c
+++ b/kernel/arm/zasum.c
@ -0,0 +1,71 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 /* Pick the absolute-value routine that matches the width of FLOAT. */
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /* |Re| + |Im| of the complex value stored at x[i], x[i+1]. */
 #define CABS1(x,i)	ABS(x[i])+ABS(x[i+1])
 /*
  * Sum of |Re| + |Im| over the n complex elements of x.
  *
  * x holds interleaved (real, imaginary) pairs; consecutive elements are
  * 2 * inc_x FLOATs apart.  Returns 0.0 when n < 0 or inc_x < 1.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	FLOAT total = 0.0;
 	BLASLONG pos;
 	BLASLONG step;
 	BLASLONG limit;
 	if (n >= 0 && inc_x >= 1)
 	{
 		step = 2 * inc_x;
 		/* one-past-the-end offset, measured in FLOATs */
 		limit = n * step;
 		for (pos = 0; pos < limit; pos += step)
 		{
 			total += CABS1(x,pos);
 		}
 	}
 	return(total);
 }
--- a/kernel/arm/zaxpy.c
+++ b/kernel/arm/zaxpy.c
@ -0,0 +1,72 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/15 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Complex AXPY: y += alpha * x, or y += alpha * conj(x) when CONJ is defined,
  * with alpha = da_r + i * da_i.
  *
  * x and y hold interleaved (real, imaginary) pairs; consecutive elements are
  * 2 * inc_x and 2 * inc_y FLOATs apart.  The dummy parameters are not used.
  * Always returns 0; nothing is written when n < 0 or alpha is exactly zero.
  */
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG k;
 	BLASLONG xpos;
 	BLASLONG ypos;
 	if ( n < 0 || ( da_r == 0.0 && da_i == 0.0 ) ) return(0);
 	BLASLONG xstep = 2 * inc_x;
 	BLASLONG ystep = 2 * inc_y;
 	for (k = 0, xpos = 0, ypos = 0; k < n; k++, xpos += xstep, ypos += ystep)
 	{
 		/* x is re-read for the imaginary update on purpose, so the result
 		 * stays the same as before when x and y overlap. */
 #if !defined(CONJ)
 		y[ypos]   += ( da_r * x[xpos]   - da_i * x[xpos+1] ) ;
 		y[ypos+1] += ( da_r * x[xpos+1] + da_i * x[xpos]   ) ;
 #else
 		y[ypos]   += ( da_r * x[xpos]   + da_i * x[xpos+1] ) ;
 		y[ypos+1] -= ( da_r * x[xpos+1] - da_i * x[xpos]   ) ;
 #endif
 	}
 	return(0);
 }
--- a/kernel/arm/zcopy.c
+++ b/kernel/arm/zcopy.c
@ -0,0 +1,63 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 /*
  * Complex copy: y[k] = x[k] for k = 0 .. n-1.
  *
  * x and y hold interleaved (real, imaginary) pairs; consecutive elements are
  * 2 * inc_x and 2 * inc_y FLOATs apart.  Always returns 0.
  */
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG k;
 	BLASLONG src = 0;
 	BLASLONG dst = 0;
 	if ( n < 0 ) return(0);
 	BLASLONG src_step = 2 * inc_x;
 	BLASLONG dst_step = 2 * inc_y;
 	for (k = 0; k < n; k++)
 	{
 		/* real part first, then imaginary part */
 		y[dst]   = x[src] ;
 		y[dst+1] = x[src+1] ;
 		src += src_step;
 		dst += dst_step;
 	}
 	return(0);
 }
--- a/kernel/arm/zcopy_vfp.S
+++ b/kernel/arm/zcopy_vfp.S
@ -0,0 +1,223 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/07 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of stack reserved below the pushed core registers (see the prologue). */
 #define STACKSIZE 256
 /* Incoming arguments.  Presumably (n, x, inc_x, y) arrive in r0-r3 under the
  * ARM procedure call standard -- TODO confirm against the C prototype. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* First stack-passed argument (loaded into INC_Y).  The prologue pushes seven
  * words and sets fp = sp + 24, so [fp, #4] is the word just above them. */
 #define OLD_INC_Y	[fp, #4 ]
 /* Working registers: loop counter, destination pointer, destination stride. */
 #define I	r5
 #define Y	r6
 #define INC_Y	r7
 /* Byte offset ahead of X used by the pld prefetch hints. */
 #define X_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /* Unit-stride step: copies eight doubles (d0-d7, i.e. four complex elements)
  * from X to Y and post-increments both pointers.  The two pld hints touch
  * X_PRE and X_PRE+32 bytes ahead of X. */
 .macro COPY_F4
 	pld	[ X, #X_PRE  ]
 	pld	[ X, #X_PRE+32  ]
 	fldmiad	X!, { d0 - d7 }
 	fstmiad	Y!, { d0 - d7 }
 .endm
 /* Unit-stride step for the remainder loop: copies two doubles (one complex
  * element) from X to Y and post-increments both pointers. */
 .macro COPY_F1
 	fldmiad	X!, { d0 - d1 }
 	fstmiad	Y!, { d0 - d1 }
 .endm
 /*************************************************************************************************************************/
 /* Strided step: copies four complex elements one at a time.  After each
  * element X and Y advance by INC_X and INC_Y, which the routine has already
  * scaled to bytes.  The d0-d1 and d2-d3 pairs are used alternately. */
 .macro COPY_S4
 	nop
 	fldmiad	X, { d0 - d1 }
 	fstmiad	Y, { d0 - d1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d2 - d3 }
 	fstmiad	Y, { d2 - d3 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d0 - d1 }
 	fstmiad	Y, { d0 - d1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d2 - d3 }
 	fstmiad	Y, { d2 - d3 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /* Strided step for the remainder loop: copies one complex element (two
  * doubles), then advances X and Y by the byte strides INC_X and INC_Y. */
 .macro COPY_S1
 	fldmiad	X, { d0 - d1 }
 	fstmiad	Y, { d0 - d1 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * Double-complex copy kernel: copies N complex elements from X to Y.
 	 * Does nothing when N <= 0 or either increment is 0.  When both
 	 * increments are 1 the contiguous F path is taken, otherwise the
 	 * strided S path.  Each path copies in groups of four and then
 	 * finishes the N & 3 leftover elements one by one.  r0 is 0 on return.
 	 * PROLOGUE and EPILOGUE are macros supplied by common.h.
 	 */
 	PROLOGUE
 	.align 5
 	// Save r4-r9 and fp (7 words); fp then points at the highest saved word.
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	// Move the remaining arguments into their working registers.
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 	// Nothing to do for an empty vector or a zero stride.
 	cmp	N, #0
 	ble	zcopy_kernel_L999
 	cmp	INC_X, #0
 	beq	zcopy_kernel_L999
 	cmp	INC_Y, #0
 	beq	zcopy_kernel_L999
 	// Any stride other than 1 on either side selects the strided path.
 	cmp	INC_X, #1
 	bne	zcopy_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	zcopy_kernel_S_BEGIN
 zcopy_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	zcopy_kernel_F1
 zcopy_kernel_F4:
 	COPY_F4
 	subs	I, I, #1
 	bne	zcopy_kernel_F4
 zcopy_kernel_F1:
 	// Leftover elements of the contiguous path (N & 3).
 	ands	I, N, #3
 	ble	zcopy_kernel_L999
 zcopy_kernel_F10:
 	COPY_F1
 	subs    I, I, #1
        bne     zcopy_kernel_F10
 	b	zcopy_kernel_L999
 zcopy_kernel_S_BEGIN:
 	// Convert element strides to byte strides (16 bytes per complex double).
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
 	asrs	I, N, #2					// I = N / 4
 	ble	zcopy_kernel_S1
 zcopy_kernel_S4:
 	COPY_S4
 	subs	I, I, #1
 	bne	zcopy_kernel_S4
 zcopy_kernel_S1:
 	// Leftover elements of the strided path (N & 3).
 	ands	I, N, #3
 	ble	zcopy_kernel_L999
 zcopy_kernel_S10:
 	COPY_S1
 	subs    I, I, #1
        bne     zcopy_kernel_S10
 zcopy_kernel_L999:
 	// Common exit: restore d8-d15, unwind the stack frame and return 0.
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/zdot.c
+++ b/kernel/arm/zdot.c
@ -0,0 +1,78 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: FAIL
 * 	 BLASTEST double	: FAIL
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <complex.h>
 /*
  * Complex dot product of x and y over n elements: sum of x[k] * y[k], or of
  * conj(x[k]) * y[k] when CONJ is defined.
  *
  * x and y hold interleaved (real, imaginary) pairs; consecutive elements are
  * 2 * inc_x and 2 * inc_y FLOATs apart.  Returns 0 + 0i when n < 1.
  */
 FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	FLOAT _Complex result;
 	FLOAT acc_re = 0.0;
 	FLOAT acc_im = 0.0;
 	BLASLONG k;
 	BLASLONG px;
 	BLASLONG py;
 	if ( n >= 1 )
 	{
 		BLASLONG step_x = 2 * inc_x ;
 		BLASLONG step_y = 2 * inc_y ;
 		for (k = 0, px = 0, py = 0; k < n; k++, px += step_x, py += step_y)
 		{
 #if !defined(CONJ)
 			acc_re += ( x[px]   * y[py] - x[px+1] * y[py+1] ) ;
 			acc_im += ( x[px+1] * y[py] + x[px]   * y[py+1] ) ;
 #else
 			acc_re += ( x[px]   * y[py] + x[px+1] * y[py+1] ) ;
 			acc_im -= ( x[px+1] * y[py] - x[px]   * y[py+1] ) ;
 #endif
 		}
 	}
 	/* Assemble the complex return value from the two accumulators. */
 	__real__ result = acc_re;
 	__imag__ result = acc_im;
 	return(result);
 }
--- a/kernel/arm/zdot_vfp.S
+++ b/kernel/arm/zdot_vfp.S
@ -0,0 +1,286 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/11 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of stack reserved below the pushed core registers (see the prologue). */
 #define STACKSIZE 256
 /* Incoming arguments.  Presumably (n, x, inc_x, y) arrive in r0-r3 under the
  * ARM procedure call standard -- TODO confirm against the C prototype. */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define	OLD_Y	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* First stack-passed argument (loaded into INC_Y).  The prologue pushes seven
  * words and sets fp = sp + 24, so [fp, #4] is the word just above them. */
 #define OLD_INC_Y	[fp, #4 ]
 /* Working registers: loop counter, second vector pointer, its stride. */
 #define I	r5
 #define Y	r6
 #define INC_Y	r7
 /* Byte offset used by the pld prefetch hints on both X and Y. */
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /* Unit-stride step over four complex elements.  For every element, with
  * (xr, xi) loaded from X and (yr, yi) loaded from Y, the accumulators get
  *   d0 += xr * yr    d1 += xr * yi    d2 += xi * yi    d3 += xi * yr
  * X and Y are post-incremented by the loads.  Elements alternate between the
  * register sets d4/d5 + d8/d9 and d6/d7 + d10/d11, and pld hints are issued
  * X_PRE bytes ahead of both pointers. */
 .macro KERNEL_F4
 	pld	[ X, #X_PRE  ]
 	pld	[ Y, #X_PRE  ]
 	fldmiad	X!, { d4 - d5 }
 	fldmiad	Y!, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fldmiad	X!, { d6 - d7 }
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 	fldmiad	Y!, { d10 - d11 }
 	fmacd   d0  , d6,  d10
 	fmacd   d1  , d6,  d11
 	pld	[ X, #X_PRE  ]
 	fmacd   d2  , d7,  d11
 	fmacd   d3  , d7,  d10
 	pld	[ Y, #X_PRE  ]
 	fldmiad	X!, { d4 - d5 }
 	fldmiad	Y!, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fldmiad	X!, { d6 - d7 }
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 	fldmiad	Y!, { d10 - d11 }
 	fmacd   d0  , d6,  d10
 	fmacd   d1  , d6,  d11
 	fmacd   d2  , d7,  d11
 	fmacd   d3  , d7,  d10
 .endm
 /* Unit-stride step over one complex element: loads (xr, xi) into d4/d5 and
  * (yr, yi) into d8/d9 with post-increment, then accumulates
  *   d0 += xr * yr    d1 += xr * yi    d2 += xi * yi    d3 += xi * yr */
 .macro KERNEL_F1
 	fldmiad	X!, { d4 - d5 }
 	fldmiad	Y!, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 .endm
 /*************************************************************************************************************************/
 /* Strided step over four complex elements.  Each element is loaded without
  * writeback, accumulated into d0-d3 exactly as in KERNEL_S1, and then X and
  * Y are advanced by the byte strides INC_X and INC_Y. */
 .macro KERNEL_S4
 	nop
 	fldmiad	X, { d4 - d5 }
 	fldmiad	Y, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d4 - d5 }
 	fldmiad	Y, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d4 - d5 }
 	fldmiad	Y, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 	fldmiad	X, { d4 - d5 }
 	fldmiad	Y, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /* Strided step over one complex element: loads (xr, xi) into d4/d5 and
  * (yr, yi) into d8/d9, accumulates
  *   d0 += xr * yr    d1 += xr * yi    d2 += xi * yi    d3 += xi * yr
  * and advances X and Y by the byte strides INC_X and INC_Y. */
 .macro KERNEL_S1
 	fldmiad	X, { d4 - d5 }
 	fldmiad	Y, { d8 - d9 }
 	fmacd   d0  , d4,  d8
 	fmacd   d1  , d4,  d9
 	fmacd   d2  , d5,  d9
 	fmacd   d3  , d5,  d8
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * Double-complex dot product kernel over N elements of X and Y.
 	 * The partial sums are kept in d0-d3 (see the KERNEL_* macros) and
 	 * combined at the end: d0 holds the real part and d1 the imaginary
 	 * part of the result.  With CONJ defined the first vector is
 	 * conjugated.  The result is zero when N <= 0 or either increment
 	 * is 0.  When both increments are 1 the contiguous F path is taken,
 	 * otherwise the strided S path.
 	 * PROLOGUE and EPILOGUE are macros supplied by common.h.
 	 */
 	PROLOGUE
 	.align 5
 	// Save r4-r9 and fp (7 words); fp then points at the highest saved word.
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	mov	Y, OLD_Y
 	ldr	INC_Y, OLD_INC_Y
 	// Clear the accumulators from an integer zero.  Subtracting a register
 	// from itself is not a safe way to clear it: d0-d3 still hold whatever
 	// the caller left there, and x - x is NaN when x is NaN or infinite.
 	mov	r4, #0						// r4 is free again after the vstm above
 	vmov	d0, r4, r4
 	vmov	d1, r4, r4
 	vmov	d2, r4, r4
 	vmov	d3, r4, r4
 	// Nothing to accumulate for an empty vector or a zero stride.
 	cmp	N, #0
 	ble	zdot_kernel_L999
 	cmp	INC_X, #0
 	beq	zdot_kernel_L999
 	cmp	INC_Y, #0
 	beq	zdot_kernel_L999
 	// Any stride other than 1 on either side selects the strided path.
 	cmp	INC_X, #1
 	bne	zdot_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	zdot_kernel_S_BEGIN
 zdot_kernel_F_BEGIN:
 	asrs	I, N, #2					// I = N / 4
 	ble	zdot_kernel_F1
 zdot_kernel_F4:
 	KERNEL_F4
 	subs	I, I, #1
 	bne	zdot_kernel_F4
 zdot_kernel_F1:
 	// Leftover elements of the contiguous path (N & 3).
 	ands	I, N, #3
 	ble	zdot_kernel_L999
 zdot_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
 	bne     zdot_kernel_F10
 	b	zdot_kernel_L999
 zdot_kernel_S_BEGIN:
 	// Convert element strides to byte strides (16 bytes per complex double).
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
 	asrs	I, N, #2					// I = N / 4
 	ble	zdot_kernel_S1
 zdot_kernel_S4:
 	KERNEL_S4
 	subs	I, I, #1
 	bne	zdot_kernel_S4
 zdot_kernel_S1:
 	// Leftover elements of the strided path (N & 3).
 	ands	I, N, #3
 	ble	zdot_kernel_L999
 zdot_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
 	bne     zdot_kernel_S10
 zdot_kernel_L999:
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	// Combine the partial sums: d0 = xr*yr, d1 = xr*yi, d2 = xi*yi, d3 = xi*yr.
 #if !defined(CONJ)
 	vsub.f64	d0 , d0, d2				// real = xr*yr - xi*yi
 	vadd.f64	d1 , d1, d3				// imag = xr*yi + xi*yr
 #else
 	vadd.f64	d0 , d0, d2				// real = xr*yr + xi*yi
 	vsub.f64	d1 , d1, d3				// imag = xr*yi - xi*yr
 #endif
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/zgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/zgemm_kernel_2x2_vfp.S
--- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S
--- a/kernel/arm/zgemm_ncopy_2_vfp.S
+++ b/kernel/arm/zgemm_ncopy_2_vfp.S
@ -0,0 +1,254 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/05 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of stack reserved below the pushed core registers (see the prologue). */
 #define STACKSIZE 256
 /* Incoming arguments.  Presumably (m, n, a, lda) arrive in r0-r3 under the
  * ARM procedure call standard -- TODO confirm against the C prototype. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /* Stack slot inside the reserved area; the prologue stores lda << 4 there. */
 #define LDA	[fp, #-260 ]
 /* First stack-passed argument (loaded into BO).  The prologue pushes seven
  * words and sets fp = sp + 24, so [fp, #4] is the word just above them. */
 #define B	[fp, #4 ]
 /* M, N and A stay in the registers they arrived in. */
 #define M	r0
 #define N	r1
 #define A	r2
 /* Output pointer and the two column pointers into A. */
 #define	BO	r5
 #define	AO1	r6
 #define	AO2	r7
 /* Loop counters; I reuses r3 once lda has been saved to the LDA slot. */
 #define I	r3
 #define	J	r12
 /* Byte offset used by the pld prefetch hints on AO1 and AO2. */
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /* Packs two complex elements from each of two columns.  The registers are
  * assigned so that the single d0-d7 store writes, in order,
  *   AO1[0], AO2[0], AO1[1], AO2[1]
  * (each a real/imaginary pair of doubles).  AO1 and AO2 advance by 32 bytes
  * and BO by 64 bytes. */
 .macro COPY2x2
 	pld	[ AO1, #A_PRE  ]
 	pld	[ AO2, #A_PRE  ]
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO1, #8  ]
 	fldd	d4 , [ AO1, #16 ]
 	fldd	d5 , [ AO1, #24 ]
 	fldd	d2 , [ AO2, #0  ]
 	fldd	d3 , [ AO2, #8  ]
 	add	AO1, AO1, #32
 	fldd	d6 , [ AO2, #16 ]
 	fldd	d7 , [ AO2, #24 ]
 	fstmiad	BO!, { d0 - d7 }
 	add	AO2, AO2, #32
 .endm
 /* Packs one complex element from each of two columns: writes AO1[0] then
  * AO2[0] to BO.  AO1 and AO2 advance by 16 bytes and BO by 32 bytes. */
 .macro COPY1x2
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO1, #8  ]
 	fldd	d2 , [ AO2, #0  ]
 	fldd	d3 , [ AO2, #8  ]
 	add	AO1, AO1, #16
 	fstmiad	BO!, { d0 - d3 }
 	add	AO2, AO2, #16
 .endm
 /* Copies two consecutive complex elements (four doubles) of a single column
  * from AO1 to BO.  Both pointers advance by 32 bytes. */
 .macro COPY2x1
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO1, #8  ]
 	fldd	d2 , [ AO1, #16 ]
 	fldd	d3 , [ AO1, #24 ]
 	fstmiad	BO!, { d0 - d3 }
 	add	AO1, AO1, #32
 .endm
 /* Copies one complex element (two doubles) of a single column from AO1 to
  * BO.  Both pointers advance by 16 bytes. */
 .macro COPY1x1
 	fldd	d0 , [ AO1, #0  ]
 	fldd	d1 , [ AO1, #8  ]
 	fstmiad	BO!, { d0 - d1 }
 	add	AO1, AO1, #16
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * Packs an M x N block of double-complex values from A (columns lda
 	 * elements apart) into the buffer B.  Columns are handled two at a
 	 * time with their rows interleaved (see COPY2x2 and COPY1x2); a final
 	 * odd column is copied as is (COPY2x1 and COPY1x1).  r0 is 0 on return.
 	 *
 	 * The loop-entry tests compare explicitly (asr + cmp, or beq after
 	 * ands/tst).  asrs, ands and tst leave the V flag untouched, so a
 	 * following ble would depend on whatever V the caller left behind.
 	 * PROLOGUE and EPILOGUE are macros supplied by common.h.
 	 */
 	PROLOGUE
 	.align 5
 	// Save r4-r9 and fp (7 words); fp then points at the highest saved word.
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	lsl	r3, r3, #4					// lda = lda * 8 * 2
 	str	r3, LDA
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	ldr	BO, B						// BO = packed output pointer
 /*********************************************************************************************/
 zgemm_ncopy_L2_BEGIN:
 	asr	J, N, #1					// J = N / 2
 	cmp	J, #0
 	ble	zgemm_ncopy_L1_BEGIN
 zgemm_ncopy_L2_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	AO2, AO1, r4
 	add	A  , AO2, r4 					// A = A + 2 * LDA
 	asr	I, M, #1					// I = M / 2
 	cmp	I, #0
 	ble	zgemm_ncopy_L2_M2_40
 zgemm_ncopy_L2_M2_20:
 	COPY2x2
 	subs	I , I , #1
 	bne	zgemm_ncopy_L2_M2_20
 zgemm_ncopy_L2_M2_40:
 	ands	I, M , #1					// odd row left over?
 	beq	zgemm_ncopy_L2_M2_END
 zgemm_ncopy_L2_M2_60:
 	COPY1x2
 	subs	I , I , #1
 	bne	zgemm_ncopy_L2_M2_60
 zgemm_ncopy_L2_M2_END:
 	subs	J , J, #1					// j--
 	bne	zgemm_ncopy_L2_M2_BEGIN
 /*********************************************************************************************/
 zgemm_ncopy_L1_BEGIN:
 	tst	N, #1						// odd column left over?
 	beq	zgemm_ncopy_L999
 zgemm_ncopy_L1_M2_BEGIN:
 	mov	AO1, A						// AO1 = A
 	ldr	r4 , LDA
 	add	A  , AO1, r4 					// A = A + 1 * LDA
 	asr	I, M, #1					// I = M / 2
 	cmp	I, #0
 	ble	zgemm_ncopy_L1_M2_40
 zgemm_ncopy_L1_M2_20:
 	COPY2x1
 	subs	I , I , #1
 	bne	zgemm_ncopy_L1_M2_20
 zgemm_ncopy_L1_M2_40:
 	ands	I, M , #1					// odd row left over?
 	beq	zgemm_ncopy_L1_M2_END
 zgemm_ncopy_L1_M2_60:
 	COPY1x1
 	subs	I , I , #1
 	bne	zgemm_ncopy_L1_M2_60
 zgemm_ncopy_L1_M2_END:
 zgemm_ncopy_L999:
 	// Common exit: restore d8-d15, unwind the stack frame and return 0.
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	movs	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/zgemm_tcopy_2_vfp.S
+++ b/kernel/arm/zgemm_tcopy_2_vfp.S
@ -0,0 +1,245 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/07 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of scratch space reserved below the pushed core registers. */
 #define STACKSIZE 256
 /* Incoming arguments that arrive in core registers. */
 #define	OLD_M	r0
 #define	OLD_N	r1
 #define	OLD_A	r2
 #define	OLD_LDA	r3
 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
 * for store and restore of floating point
 * registers
 *******************************************************/
 /*
  * Memory operands addressed through fp.  The prologue sets fp = sp + 24
  * after pushing seven words, so [fp, #4] is the first word above the
  * pushed registers (the B pointer is loaded from and written back to it);
  * A is a scratch slot inside the reserved area that holds the running
  * source pointer.
  */
 #define B	[fp, #4 ]
 #define A	[fp, #-248 ]
 /* Working registers.  M and N keep their incoming registers; M4 reuses r2
  * once OLD_A has been spilled to the A slot. */
 #define M	r0
 #define N	r1
 #define M4	r2
 #define	LDA	r5
 #define	AO1	r6
 #define	BO1	r7
 #define	BO2	r8
 #define I	r4
 #define	J	r12
 /* Byte offset used by the pld prefetches in COPY2x2. */
 #define A_PRE	256
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * COPY2x2: loads 32 bytes at AO1 (d0-d3) and 32 bytes at AO1 + LDA (d4-d7),
  * stores the 64 bytes contiguously at BO1, then advances AO1 by 32 bytes
  * and BO1 by M4.  Both source lines are prefetched A_PRE bytes ahead.
  * Clobbers r3 and d0-d7; does not touch the condition flags.
  */
 .macro COPY2x2
 	pld	[ AO1, #A_PRE ]
 	fldmiad	AO1, { d0 - d3 }
 	add	r3, AO1, LDA					// r3 = second source line
 	pld	[ r3, #A_PRE ]
 	fldmiad	r3, { d4 - d7 }
 	fstmiad	BO1, { d0 - d7 }
 	add	AO1, AO1, #32
 	add	BO1, BO1, M4
 .endm
 /*
  * COPY1x2: loads 16 bytes at AO1 (d0-d1) and 16 bytes at AO1 + LDA (d2-d3),
  * stores the 32 bytes contiguously at BO2, then advances AO1 by 16 bytes
  * and BO2 by 32 bytes.  Clobbers r3 and d0-d3.
  */
 .macro COPY1x2
 	fldmiad	AO1, { d0 -d1 }
 	add	r3, AO1, LDA					// r3 = second source line
 	fldmiad	r3, { d2 - d3 }
 	fstmiad	BO2, { d0 - d3 }
 	add	AO1, AO1, #16
 	add	BO2, BO2, #32
 .endm
 /*************************************************************************************************************************/
 /*
  * COPY2x1: copies 32 bytes from AO1 to BO1 through d0-d3, then advances
  * AO1 by 32 bytes and BO1 by M4.
  */
 .macro COPY2x1
 	fldmiad	AO1, { d0 - d3 }
 	fstmiad	BO1, { d0 - d3 }
 	add	AO1, AO1, #32
 	add	BO1, BO1, M4
 .endm
 /*
  * COPY1x1: copies 16 bytes from AO1 to BO2 through d0-d1, then advances
  * both pointers by 16 bytes.
  */
 .macro COPY1x1
 	fldmiad	AO1, { d0 - d1 }
 	fstmiad	BO2, { d0 - d1 }
 	add	AO1, AO1, #16
 	add	BO2, BO2, #16
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * Driver of the 2-wide "tcopy" packing routine.
 	 *
 	 * Setup:
 	 *   LDA = OLD_LDA * 16 (byte stride between source lines)
 	 *   BO2 = B + (N & ~1) * M * 16   destination of the odd-N leftovers
 	 *   M4  = M * 32                  distance between successive tiles
 	 *                                 written through BO1
 	 * Main loop (M / 2 iterations): AO1 = A, A += 2 * LDA, BO1 = B, B += 64,
 	 * then N / 2 times COPY2x2 and one COPY1x2 when N is odd.
 	 * Tail (M odd): AO1 = A, BO1 = B, N / 2 times COPY2x1 and one COPY1x1
 	 * when N is odd.  Returns 0 in r0.
 	 *
 	 * Loop-count tests never use "ble" directly after asrs/tst: those
 	 * instructions leave the V flag untouched, and LE (Z == 1 || N != V)
 	 * would then depend on whatever V value the caller left behind, since
 	 * nothing in the prologue sets the flags.  Shifted counts are compared
 	 * against zero explicitly and single-bit tests branch with "beq".
 	 */
 	PROLOGUE
 	.align 5
 	push	{r4 - r9, fp}
 	add	fp, sp, #24
 	sub	sp, sp, #STACKSIZE				// reserve stack
 	str	OLD_A, A					// store A
 	lsl	LDA, OLD_LDA, #4				// lda = lda * SIZE * 2
 	sub	r4, fp, #128
 	vstm	r4, { d8 - d15} 				// store floating point registers
 	lsl	r4 , M, #4					// M * SIZE * 2
 	ldr	r3, B
 	and	BO2 , N , #-2					// N rounded down to even
 	mul	BO2, BO2, r4
 	add	BO2 , BO2, r3					// BO2 = B + (N & ~1) * M * SIZE * 2
 	lsl	M4, M, #5					// M4 = M * 2 * SIZE * 2
 zgemm_tcopy_L2_BEGIN:
 	asr	J, M, #1					// J = M / 2
 	cmp	J, #0						// defines N, Z and V for the signed test
 	ble	zgemm_tcopy_L1_BEGIN
 zgemm_tcopy_L2_M2_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	lsl	r3, LDA, #1					// r3 = 2 * LDA
 	add	r3, r3 , AO1					// A = A + 2 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #64					// B = B + 4 * SIZE *2
 	str	r3, B
 	asr	I, N, #1					// I = N / 2
 	cmp	I, #0
 	ble	zgemm_tcopy_L2_M2_60
 zgemm_tcopy_L2_M2_40:
 	COPY2x2
 	subs I, I, #1
 	bne	zgemm_tcopy_L2_M2_40
 zgemm_tcopy_L2_M2_60:
 	tst	N , #1						// odd N: one leftover pair
 	beq	zgemm_tcopy_L2_M2_END
 	COPY1x2
 zgemm_tcopy_L2_M2_END:
 	subs	J , J, #1						// j--
 	bne	zgemm_tcopy_L2_M2_BEGIN
 /*********************************************************************************************/
 zgemm_tcopy_L1_BEGIN:
 	tst	M, #1						// odd M: one line left
 	beq	zgemm_tcopy_L999
 zgemm_tcopy_L1_M2_BEGIN:
 	ldr	AO1, A						// AO1 = A
 	add	r3, LDA , AO1					// A = A + 1 * LDA
 	str	r3, A						// store A
 	ldr	BO1, B
 	add	r3, BO1, #32					// B = B + 2 * SIZE *2
 	str	r3, B
 	asr	I, N, #1					// I = N / 2
 	cmp	I, #0
 	ble	zgemm_tcopy_L1_M2_60
 zgemm_tcopy_L1_M2_40:
 	COPY2x1
 	subs I, I, #1
 	bne	zgemm_tcopy_L1_M2_40
 zgemm_tcopy_L1_M2_60:
 	tst	N , #1						// odd N: one leftover element
 	beq	zgemm_tcopy_L1_M2_END
 	COPY1x1
 zgemm_tcopy_L1_M2_END:
 zgemm_tcopy_L999:
 	sub	r3, fp, #128
 	vldm	r3, { d8 - d15}					// restore floating point registers
 	mov	r0, #0						// set return value
 	sub	sp, fp, #24
 	pop	{r4 - r9, fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/zgemv_n.c
+++ b/kernel/arm/zgemv_n.c
@ -0,0 +1,157 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * * 2013/11/23 Saar
 * *	 BLASTEST float		: OK
 * * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 * *
 * **************************************************************************************/
 #include "common.h"
 /*
  * Sign patterns selected at compile time by CONJ / XCONJ.
  *
  * ZGEMV_N_SCALE_X  forms the per-column scalar (tr, ti) from alpha and one
  *                  element (xr, xi) of x.
  * ZGEMV_N_ACCUM    adds the product of that scalar with one element
  *                  (ar, ai) of the matrix column to (yr, yi).
  */
 #if !defined(XCONJ)
 #define ZGEMV_N_SCALE_X(tr, ti, xr, xi)				\
 	do {							\
 		(tr) = alpha_r * (xr) - alpha_i * (xi);		\
 		(ti) = alpha_r * (xi) + alpha_i * (xr);		\
 	} while (0)
 #else
 #define ZGEMV_N_SCALE_X(tr, ti, xr, xi)				\
 	do {							\
 		(tr) = alpha_r * (xr) + alpha_i * (xi);		\
 		(ti) = alpha_r * (xi) - alpha_i * (xr);		\
 	} while (0)
 #endif

 #if !defined(CONJ) && !defined(XCONJ)
 #define ZGEMV_N_ACCUM(yr, yi, tr, ti, ar, ai)			\
 	do {							\
 		(yr) += (tr) * (ar) - (ti) * (ai);		\
 		(yi) += (tr) * (ai) + (ti) * (ar);		\
 	} while (0)
 #elif !defined(CONJ) && defined(XCONJ)
 #define ZGEMV_N_ACCUM(yr, yi, tr, ti, ar, ai)			\
 	do {							\
 		(yr) += (tr) * (ar) + (ti) * (ai);		\
 		(yi) += (tr) * (ai) - (ti) * (ar);		\
 	} while (0)
 #elif defined(CONJ) && !defined(XCONJ)
 #define ZGEMV_N_ACCUM(yr, yi, tr, ti, ar, ai)			\
 	do {							\
 		(yr) += (tr) * (ar) + (ti) * (ai);		\
 		(yi) -= (tr) * (ai) - (ti) * (ar);		\
 	} while (0)
 #else
 #define ZGEMV_N_ACCUM(yr, yi, tr, ti, ar, ai)			\
 	do {							\
 		(yr) += (tr) * (ar) - (ti) * (ai);		\
 		(yi) -= (tr) * (ai) + (ti) * (ar);		\
 	} while (0)
 #endif

 /*
  * Complex matrix-vector update, column by column.
  *
  * a holds interleaved (real, imag) pairs with a column stride of 2 * lda
  * scalars; x and y hold interleaved pairs with strides of 2 * inc_x and
  * 2 * inc_y scalars.  For each of the n columns one element of x is scaled
  * by alpha and the m entries of that column, multiplied by the scaled
  * value, are accumulated into y.  dummy1 and buffer are not used.
  * Always returns 0.
  */
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
 	BLASLONG row, col;
 	BLASLONG x_pos = 0;		/* scalar index of the current x element */
 	BLASLONG y_pos;			/* scalar index of the current y element */
 	BLASLONG a_pos;			/* scalar index inside the current column */
 	BLASLONG x_step, y_step;
 	BLASLONG col_step = 2 * lda;
 	FLOAT *column = a;
 	FLOAT scaled_r, scaled_i;

 	/* Contiguous vectors: same arithmetic with compile-time strides. */
 	if (inc_x == 1 && inc_y == 1) {
 		for (col = 0; col < n; col++) {
 			ZGEMV_N_SCALE_X(scaled_r, scaled_i, x[x_pos], x[x_pos + 1]);

 			y_pos = 0;
 			a_pos = 0;
 			for (row = 0; row < m; row++) {
 				ZGEMV_N_ACCUM(y[y_pos], y[y_pos + 1], scaled_r, scaled_i,
 					      column[a_pos], column[a_pos + 1]);
 				a_pos += 2;
 				y_pos += 2;
 			}

 			column += col_step;
 			x_pos  += 2;
 		}
 		return(0);
 	}

 	/* General strides. */
 	x_step = 2 * inc_x;
 	y_step = 2 * inc_y;

 	for (col = 0; col < n; col++) {
 		ZGEMV_N_SCALE_X(scaled_r, scaled_i, x[x_pos], x[x_pos + 1]);

 		y_pos = 0;
 		a_pos = 0;
 		for (row = 0; row < m; row++) {
 			ZGEMV_N_ACCUM(y[y_pos], y[y_pos + 1], scaled_r, scaled_i,
 				      column[a_pos], column[a_pos + 1]);
 			a_pos += 2;
 			y_pos += y_step;
 		}

 		column += col_step;
 		x_pos  += x_step;
 	}
 	return(0);
 }

 #undef ZGEMV_N_SCALE_X
 #undef ZGEMV_N_ACCUM
--- a/kernel/arm/zgemv_n_vfp.S
+++ b/kernel/arm/zgemv_n_vfp.S
@ -0,0 +1,699 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/29 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of scratch space reserved below the pushed core registers. */
 #define STACKSIZE 256
 /*
  * Arguments read from the stack.  The prologue pushes seven words and sets
  * fp = sp + 28, so [fp, #0] is the first word above the pushed registers.
  */
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 /* Arguments that arrive in core registers. */
 #define OLD_A		r3
 #define	OLD_M		r0
 /* Working registers.  AO1 reuses r0 after OLD_M has been spilled to M. */
 #define AO1	r0
 #define N	r1
 #define J	r2
 #define AO2	r4
 #define XO	r5
 #define YO	r6
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 #define I	r12
 /*
  * Scratch slots inside the reserved area.  d8-d15 are saved separately at
  * [fp, #-192] .. [fp, #-128] by the prologue.
  */
 #define ALPHA_I [fp, #-236]
 #define ALPHA_R [fp, #-244]
 #define M	[fp, #-252 ]
 #define A	[fp, #-256 ]
 /* Byte offsets used by the pld prefetches. */
 #define X_PRE	64
 #define Y_PRE	0
 #define A_PRE	0
 /**************************************************************************************/
 /*
  * Opcode selection for the four CONJ / XCONJ combinations.
  * fmacd computes d += a * b, fnmacd computes d -= a * b.
  *
  * KMAC_R / KMAC_I are used by the KERNEL_* macros for the two products that
  * involve the imaginary part of the matrix element:
  *     acc_r  (KMAC_R)=  a_i * x_i        acc_i  (KMAC_I)=  a_i * x_r
  * FMAC_R1/R2/I1/I2 are used by the SAVE_* macros when alpha times the
  * accumulator is added to y:
  *     y_r  (FMAC_R1)=  alpha_r * acc_r    y_r  (FMAC_R2)=  alpha_i * acc_i
  *     y_i  (FMAC_I1)=  alpha_r * acc_i    y_i  (FMAC_I2)=  alpha_i * acc_r
  */
 #if !defined(CONJ) && !defined(XCONJ)
        #define KMAC_R  fnmacd
        #define KMAC_I  fmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fnmacd
        #define FMAC_I1 fmacd
        #define FMAC_I2 fmacd
 #elif defined(CONJ) && !defined(XCONJ)
        #define KMAC_R  fmacd
        #define KMAC_I  fnmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fnmacd
        #define FMAC_I1 fmacd
        #define FMAC_I2 fmacd
 #elif !defined(CONJ) && defined(XCONJ)
        #define KMAC_R  fmacd
        #define KMAC_I  fnmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fmacd
        #define FMAC_I1 fnmacd
        #define FMAC_I2 fmacd
 #else
        #define KMAC_R  fnmacd
        #define KMAC_I  fmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fmacd
        #define FMAC_I1 fnmacd
        #define FMAC_I2 fmacd
 #endif
 /*
  * INIT_F4: prefetch the current block of y and clear the eight accumulators
  * d8-d15 (real/imaginary partial sums for four rows).
  *
  * The zero comes from a core register instead of "vsub d8, d8, d8":
  * x - x is NaN whenever x is NaN or +/-Inf, so a non-finite accumulator left
  * over from the previous row block (or any value the caller left in d8)
  * would turn every following block of y into NaN.
  *
  * Clobbers r3.  Every call site uses r3 only as a scratch register that is
  * dead when this macro is expanded.  Condition flags are not modified.
  */
 .macro INIT_F4
 	pld	[ YO, #Y_PRE ]
        mov                     r3 , #0
        vmov                    d8 , r3 , r3
        vmov.f64                d9 , d8
        vmov.f64                d10, d8
        vmov.f64                d11, d8
        vmov.f64                d12, d8
        vmov.f64                d13, d8
        vmov.f64                d14, d8
        vmov.f64                d15, d8
 .endm
 /*
  * KERNEL_F4X4: four consecutive KERNEL_F4X1 steps (four columns of A against
  * four elements of x), with a prefetch of x before each pair of steps.
  */
 .macro KERNEL_F4X4
 	pld	[ XO, #X_PRE ]
 	KERNEL_F4X1
 	KERNEL_F4X1
 	pld	[ XO, #X_PRE ]
 	KERNEL_F4X1
 	KERNEL_F4X1
 .endm
 /*
  * KERNEL_F4X1: one column step for a block of four rows, contiguous x.
  *
  * Reads one x element (d4 = real, d5 = imaginary) and 64 bytes at AO1
  * (four real/imaginary pairs; d0-d3 are loaded twice, once per half).
  * Accumulates into d8/d9, d10/d11, d12/d13 and d14/d15; KMAC_R / KMAC_I
  * supply the sign of the products with the imaginary matrix part.
  * Advances XO by 16 bytes and AO1 and AO2 by LDA.  AO2 is used only as a
  * prefetch address.  Loads and multiply-accumulates are interleaved by
  * hand; keep the order when editing.
  */
 .macro KERNEL_F4X1
        fldd    d0 , [ AO1 ]
        fldd    d4 , [ XO ]
        fldd    d5 , [ XO, #8 ]
 	pld	[ AO2, #A_PRE ]
        fldd    d1 , [ AO1, #8  ]
        fmacd   d8  , d0,  d4
        fldd    d2 , [ AO1, #16 ]
        fmacd   d9  , d0,  d5
        fldd    d3 , [ AO1, #24 ]
        fmacd   d10 , d2,  d4
        fldd    d0 , [ AO1, #32 ]
        fmacd   d11 , d2,  d5
        KMAC_R  d8  , d1,  d5
        KMAC_I  d9  , d1,  d4
        KMAC_R  d10 , d3,  d5
        fldd    d1 , [ AO1, #40 ]
        KMAC_I  d11 , d3,  d4
        fldd    d2 , [ AO1, #48 ]
        fmacd   d12 , d0,  d4
        fldd    d3 , [ AO1, #56 ]
        fmacd   d13 , d0,  d5
 	pld	[ AO2, #A_PRE+32 ]
        fmacd   d14 , d2,  d4
        fmacd   d15 , d2,  d5
        KMAC_R  d12 , d1,  d5
        add     XO , XO, #16
        KMAC_I  d13 , d1,  d4
        add     AO1 , AO1, LDA
        KMAC_R  d14 , d3,  d5
        add     AO2 , AO2, LDA
        KMAC_I  d15 , d3,  d4
 .endm
 /*
  * SAVE_F4: add alpha times the four accumulated sums to four contiguous
  * elements of y.  alpha is reloaded from the ALPHA_R / ALPHA_I slots into
  * d0 / d1.  y is processed as two 32-byte halves; each fstmiad uses
  * write-back, so YO ends up 64 bytes further on.  Clobbers d0, d1, d4-d7.
  */
 .macro SAVE_F4
        fldd            d0, ALPHA_R
        fldd            d1, ALPHA_I
        fldmiad YO, { d4 - d7 }
        FMAC_R1 d4 , d0 , d8
        FMAC_I1 d5 , d0 , d9
        FMAC_R2 d4 , d1 , d9
        FMAC_I2 d5 , d1 , d8
        FMAC_R1 d6 , d0 , d10
        FMAC_I1 d7 , d0 , d11
        FMAC_R2 d6 , d1 , d11
        FMAC_I2 d7 , d1 , d10
        fstmiad YO!, { d4 - d7 }
        fldmiad YO, { d4 - d7 }
        FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
        FMAC_R1 d6 , d0 , d14
        FMAC_I1 d7 , d0 , d15
        FMAC_R2 d6 , d1 , d15
        FMAC_I2 d7 , d1 , d14
        fstmiad YO!, { d4 - d7 }
 .endm
 /*
  * INIT_F1: clear the accumulator pair d8/d9 for a single row.
  *
  * The zero comes from a core register instead of "vsub d8, d8, d8":
  * x - x is NaN whenever x is NaN or +/-Inf, so a non-finite value left in
  * d8 by the previous row would turn every following row of y into NaN.
  *
  * Clobbers r3, which is a dead scratch register at the call site.
  * Condition flags are not modified.
  */
 .macro INIT_F1
        mov                     r3 , #0
        vmov                    d8 , r3 , r3
        vmov.f64                d9 , d8
 .endm
 /*
  * KERNEL_F1X1: one column step for a single row, contiguous x.
  * Loads one matrix element (d0/d1) and one x element (d4/d5), accumulates
  * their product into d8/d9 with the signs chosen by KMAC_R / KMAC_I, then
  * advances XO by 16 bytes and AO1 by LDA.
  */
 .macro KERNEL_F1X1
        fldd    d0 , [ AO1 ]
        fldd    d1 , [ AO1, #8 ]
        fldd    d4 , [ XO ]
        fldd    d5 , [ XO, #8 ]
        fmacd   d8  , d0,  d4
        fmacd   d9  , d0,  d5
        KMAC_R  d8  , d1,  d5
        KMAC_I  d9  , d1,  d4
        add     XO , XO, #16
        add     AO1 , AO1, LDA
 .endm
 /*
  * SAVE_F1: add alpha times the accumulator d8/d9 to one element of y and
  * advance YO by 16 bytes.  Clobbers d0, d1, d4, d5.
  */
 .macro SAVE_F1
        fldd            d0, ALPHA_R
        fldd            d1, ALPHA_I
        fldmiad YO, { d4 - d5 }
        FMAC_R1 d4 , d0 , d8
        FMAC_I1 d5 , d0 , d9
        FMAC_R2 d4 , d1 , d9
        FMAC_I2 d5 , d1 , d8
        fstmiad YO, { d4 - d5 }
        add     YO, YO, #16
 .endm
 /****************************************************************************************/
 /*
  * INIT_S4: clear the eight accumulators d8-d15 for a block of four rows
  * (strided x / y path).
  *
  * The zero comes from a core register instead of "vsub d8, d8, d8":
  * x - x is NaN whenever x is NaN or +/-Inf, so a non-finite accumulator left
  * over from the previous row block (or any value the caller left in d8)
  * would turn every following block of y into NaN.
  *
  * Clobbers r3, which is a dead scratch register at the call site.
  * Condition flags are not modified.
  */
 .macro INIT_S4
        mov                     r3 , #0
        vmov                    d8 , r3 , r3
        vmov.f64                d9 , d8
        vmov.f64                d10, d8
        vmov.f64                d11, d8
        vmov.f64                d12, d8
        vmov.f64                d13, d8
        vmov.f64                d14, d8
        vmov.f64                d15, d8
 .endm
 /* KERNEL_S4X4: four consecutive KERNEL_S4X1 column steps. */
 .macro KERNEL_S4X4
 	KERNEL_S4X1
 	KERNEL_S4X1
 	KERNEL_S4X1
 	KERNEL_S4X1
 .endm
 /*
  * KERNEL_S4X1: one column step for a block of four rows, strided x.
  * Same arithmetic as KERNEL_F4X1 without prefetching: 64 bytes at AO1 are
  * multiplied by the x element at XO and accumulated into d8-d15.
  * Advances XO by INC_X (already scaled to bytes by the caller) and AO1 and
  * AO2 by LDA.  AO2 is only incremented here, never dereferenced.
  */
 .macro KERNEL_S4X1
        fldd    d0 , [ AO1 ]
        fldd    d1 , [ AO1, #8  ]
        fldd    d2 , [ AO1, #16 ]
        fldd    d3 , [ AO1, #24 ]
        fldd    d4 , [ XO ]
        fldd    d5 , [ XO, #8 ]
        fmacd   d8  , d0,  d4
        fmacd   d9  , d0,  d5
        fmacd   d10 , d2,  d4
        fmacd   d11 , d2,  d5
        KMAC_R  d8  , d1,  d5
        KMAC_I  d9  , d1,  d4
        KMAC_R  d10 , d3,  d5
        KMAC_I  d11 , d3,  d4
        fldd    d0 , [ AO1, #32 ]
        fldd    d1 , [ AO1, #40 ]
        fldd    d2 , [ AO1, #48 ]
        fldd    d3 , [ AO1, #56 ]
        fmacd   d12 , d0,  d4
        fmacd   d13 , d0,  d5
        fmacd   d14 , d2,  d4
        fmacd   d15 , d2,  d5
        KMAC_R  d12 , d1,  d5
        KMAC_I  d13 , d1,  d4
        KMAC_R  d14 , d3,  d5
        KMAC_I  d15 , d3,  d4
        add     XO , XO, INC_X
        add     AO1 , AO1, LDA
        add     AO2 , AO2, LDA
 .endm
 /*
  * SAVE_S4: add alpha times the four accumulated sums to four elements of y
  * that are INC_Y bytes apart.  Each element is loaded, updated and stored
  * on its own, and YO is advanced by INC_Y after every store.
  * Clobbers d0, d1, d4-d7.
  */
 .macro SAVE_S4
        fldd            d0, ALPHA_R
        fldd            d1, ALPHA_I
        fldmiad YO, { d4 - d5 }
        FMAC_R1 d4 , d0 , d8
        FMAC_I1 d5 , d0 , d9
        FMAC_R2 d4 , d1 , d9
        FMAC_I2 d5 , d1 , d8
        fstmiad YO, { d4 - d5 }
 	add	YO, YO, INC_Y
        fldmiad YO, { d6 - d7 }
        FMAC_R1 d6 , d0 , d10
        FMAC_I1 d7 , d0 , d11
        FMAC_R2 d6 , d1 , d11
        FMAC_I2 d7 , d1 , d10
        fstmiad YO, { d6 - d7 }
 	add	YO, YO, INC_Y
        fldmiad YO, { d4 - d5 }
        FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
        fstmiad YO, { d4 - d5 }
 	add	YO, YO, INC_Y
        fldmiad YO, { d6 - d7 }
        FMAC_R1 d6 , d0 , d14
        FMAC_I1 d7 , d0 , d15
        FMAC_R2 d6 , d1 , d15
        FMAC_I2 d7 , d1 , d14
        fstmiad YO, { d6 - d7 }
 	add	YO, YO, INC_Y
 .endm
 /*
  * INIT_S1: clear the accumulator pair d8/d9 for a single row (strided
  * x / y path).
  *
  * The zero comes from a core register instead of "vsub d8, d8, d8":
  * x - x is NaN whenever x is NaN or +/-Inf, so a non-finite value left in
  * d8 by the previous row would turn every following row of y into NaN.
  *
  * Clobbers r3, which is a dead scratch register at the call site.
  * Condition flags are not modified.
  */
 .macro INIT_S1
        mov                     r3 , #0
        vmov                    d8 , r3 , r3
        vmov.f64                d9 , d8
 .endm
 /*
  * KERNEL_S1X1: one column step for a single row, strided x.
  * Same arithmetic as KERNEL_F1X1; XO advances by INC_X (in bytes) instead
  * of a fixed 16, AO1 advances by LDA.
  */
 .macro KERNEL_S1X1
        fldd    d0 , [ AO1 ]
        fldd    d1 , [ AO1, #8 ]
        fldd    d4 , [ XO ]
        fldd    d5 , [ XO, #8 ]
        fmacd   d8  , d0,  d4
        fmacd   d9  , d0,  d5
        KMAC_R  d8  , d1,  d5
        KMAC_I  d9  , d1,  d4
        add     XO , XO, INC_X
        add     AO1 , AO1, LDA
 .endm
 /*
  * SAVE_S1: add alpha times the accumulator d8/d9 to one element of y and
  * advance YO by INC_Y (in bytes).  Clobbers d0, d1, d4, d5.
  */
 .macro SAVE_S1
        fldd            d0, ALPHA_R
        fldd            d1, ALPHA_I
        fldmiad YO, { d4 - d5 }
        FMAC_R1 d4 , d0 , d8
        FMAC_I1 d5 , d0 , d9
        FMAC_R2 d4 , d1 , d9
        FMAC_I2 d5 , d1 , d8
        fstmiad YO, { d4 - d5 }
        add     YO, YO, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	/*
 	 * Entry point of the VFP zgemv "n" kernel.
 	 *
 	 * Rows of y are produced in blocks of four (M / 4 blocks) followed by
 	 * M % 4 single rows.  For every block the kernel walks all N columns,
 	 * accumulating in d8-d15, and then adds alpha times the sums to y.
 	 * Two code paths exist: F* for inc_x == 1 and inc_y == 1, S* for any
 	 * other non-zero increments.  Returns 0 in r0; returns early without
 	 * touching y when M <= 0, N <= 0 or either increment is 0.
 	 *
 	 * r3 is a scratch register throughout (the A slot on the stack holds the
 	 * running row pointer), and r0 becomes AO1 once M has been spilled.
 	 */
 	PROLOGUE
 	.align 5
 	push    {r4 - r9 , fp}
        add     fp, sp, #28					// fp = sp at entry
 	sub     sp, sp, #STACKSIZE                              // reserve stack
        sub     r12, fp, #192
 /* NOTE(review): every macro in this file uses double precision opcodes, so
  * the non-DOUBLE branches below look like leftovers - confirm they are never
  * built. */
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
        vstm    r12, { s8 - s15 }                                 // store floating point registers
 #endif
 	cmp	OLD_M, #0
 	ble	zgemvn_kernel_L999
 	cmp	N, #0
 	ble	zgemvn_kernel_L999
 	str	OLD_A, A
 	str	OLD_M, M
 	vstr    d0 , ALPHA_R
        vstr    d1 , ALPHA_I
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	cmp	INC_X, #0
 	beq	zgemvn_kernel_L999
 	cmp	INC_Y, #0
 	beq	zgemvn_kernel_L999
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #4				// LDA * SIZE * 2
 #else
 	lsl	LDA, LDA, #3				// LDA * SIZE * 2
 #endif
 	cmp	INC_X, #1
 	bne	zgemvn_kernel_S4_BEGIN
 	cmp	INC_Y, #1
 	bne	zgemvn_kernel_S4_BEGIN
 /* ---- unit-stride path ------------------------------------------------ */
 zgemvn_kernel_F4_BEGIN:
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #2					// I = M / 4
 	ble	zgemvn_kernel_F1_BEGIN
 zgemvn_kernel_F4X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #64					// next block starts 4 rows (64 bytes) further down
 	str	r3 , A
 	add	AO2, AO2, LDA
 	add	AO2, AO2, LDA					// AO2 = AO1 + 3 * LDA, prefetch address only
 	ldr	XO , X
 	INIT_F4
 	asrs	J, N, #2					// J = N / 4
 	ble	zgemvn_kernel_F4X1
 zgemvn_kernel_F4X4_10:
 	KERNEL_F4X4
 	subs	J, J, #1
 	bne	zgemvn_kernel_F4X4_10
 zgemvn_kernel_F4X1:
 	ands	J, N , #3					// J = N % 4 leftover columns
 	ble	zgemvn_kernel_F4_END
 zgemvn_kernel_F4X1_10:
 	KERNEL_F4X1
 	subs	J, J, #1
 	bne	zgemvn_kernel_F4X1_10
 zgemvn_kernel_F4_END:
 	SAVE_F4
 	subs	I , I , #1
 	bne	zgemvn_kernel_F4X4
 zgemvn_kernel_F1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #3					// I = M % 4 leftover rows
 	ble	zgemvn_kernel_L999
 zgemvn_kernel_F1X1:
 	ldr	AO1, A
 	add	r3, AO1, #16					// next row starts 16 bytes further down
 	str	r3, A
 	ldr	XO , X
 	INIT_F1
 	mov	J, N						// N > 0 was checked at entry
 zgemvn_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	J, J, #1
 	bne	zgemvn_kernel_F1X1_10
 zgemvn_kernel_F1_END:
 	SAVE_F1
 	subs	I , I , #1
 	bne	zgemvn_kernel_F1X1
 	b	zgemvn_kernel_L999
 /*************************************************************************************************************/
 /* ---- strided path: same structure, increments converted to bytes ----- */
 zgemvn_kernel_S4_BEGIN:
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
 #endif
 	ldr	YO , Y
 	ldr	I, M
 	asrs	I, I, #2					// I = M / 4
 	ble	zgemvn_kernel_S1_BEGIN
 zgemvn_kernel_S4X4:
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO1, #64
 	str	r3 , A
 	ldr	XO , X
 	INIT_S4
 	asrs	J, N, #2					// J = N / 4
 	ble	zgemvn_kernel_S4X1
 zgemvn_kernel_S4X4_10:
 	KERNEL_S4X4
 	subs	J, J, #1
 	bne	zgemvn_kernel_S4X4_10
 zgemvn_kernel_S4X1:
 	ands	J, N , #3					// J = N % 4 leftover columns
 	ble	zgemvn_kernel_S4_END
 zgemvn_kernel_S4X1_10:
 	KERNEL_S4X1
 	subs	J, J, #1
 	bne	zgemvn_kernel_S4X1_10
 zgemvn_kernel_S4_END:
 	SAVE_S4
 	subs	I , I , #1
 	bne	zgemvn_kernel_S4X4
 zgemvn_kernel_S1_BEGIN:
 	ldr	I, M
 	ands	I,  I , #3					// I = M % 4 leftover rows
 	ble	zgemvn_kernel_L999
 zgemvn_kernel_S1X1:
 	ldr	AO1, A
 	add	r3, AO1, #16
 	str	r3, A
 	ldr	XO , X
 	INIT_S1
 	mov	J, N
 zgemvn_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	J, J, #1
 	bne	zgemvn_kernel_S1X1_10
 zgemvn_kernel_S1_END:
 	SAVE_S1
 	subs	I , I , #1
 	bne	zgemvn_kernel_S1X1
 /*************************************************************************************************************/
 /* ---- common exit: restore callee-saved state and return 0 ------------ */
 zgemvn_kernel_L999:
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s15 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/zgemv_t.c
+++ b/kernel/arm/zgemv_t.c
@ -0,0 +1,140 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * * 2013/11/23 Saar
 * *	 BLASTEST float		: OK
 * * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 * *
 * **************************************************************************************/
 #include "common.h"
 /*
  * Sign patterns selected at compile time by CONJ / XCONJ.
  *
  * ZGEMV_T_DOT     adds the product of one matrix element (ar, ai) and one
  *                 element (xr, xi) of x to the running sum (sr, si).
  * ZGEMV_T_STORE   adds alpha times the finished sum (sr, si) to (yr, yi).
  */
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 #define ZGEMV_T_DOT(sr, si, ar, ai, xr, xi)			\
 	do {							\
 		(sr) += (ar) * (xr) - (ai) * (xi);		\
 		(si) += (ar) * (xi) + (ai) * (xr);		\
 	} while (0)
 #else
 #define ZGEMV_T_DOT(sr, si, ar, ai, xr, xi)			\
 	do {							\
 		(sr) += (ar) * (xr) + (ai) * (xi);		\
 		(si) += (ar) * (xi) - (ai) * (xr);		\
 	} while (0)
 #endif

 #if !defined(XCONJ)
 #define ZGEMV_T_STORE(yr, yi, sr, si)				\
 	do {							\
 		(yr) += alpha_r * (sr) - alpha_i * (si);	\
 		(yi) += alpha_r * (si) + alpha_i * (sr);	\
 	} while (0)
 #else
 #define ZGEMV_T_STORE(yr, yi, sr, si)				\
 	do {							\
 		(yr) += alpha_r * (sr) + alpha_i * (si);	\
 		(yi) -= alpha_r * (si) - alpha_i * (sr);	\
 	} while (0)
 #endif

 /*
  * Complex matrix-vector update, one dot product per column.
  *
  * a holds interleaved (real, imag) pairs with a column stride of 2 * lda
  * scalars; x and y hold interleaved pairs with strides of 2 * inc_x and
  * 2 * inc_y scalars.  For each of the n columns the m entries of the column
  * are multiplied with the m entries of x and summed; alpha times that sum
  * is added to one element of y.  dummy1 and buffer are not used.
  * Always returns 0.
  */
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
 	BLASLONG row, col;
 	BLASLONG x_pos;			/* scalar index of the current x element */
 	BLASLONG y_pos = 0;		/* scalar index of the current y element */
 	BLASLONG a_pos;			/* scalar index inside the current column */
 	BLASLONG x_step, y_step;
 	BLASLONG col_step = 2 * lda;
 	FLOAT *column = a;
 	FLOAT sum_r, sum_i;

 	/* Contiguous vectors: same arithmetic with compile-time strides. */
 	if (inc_x == 1 && inc_y == 1) {
 		for (col = 0; col < n; col++) {
 			sum_r = 0.0;
 			sum_i = 0.0;
 			x_pos = 0;
 			a_pos = 0;
 			for (row = 0; row < m; row++) {
 				ZGEMV_T_DOT(sum_r, sum_i, column[a_pos], column[a_pos + 1],
 					    x[x_pos], x[x_pos + 1]);
 				a_pos += 2;
 				x_pos += 2;
 			}

 			ZGEMV_T_STORE(y[y_pos], y[y_pos + 1], sum_r, sum_i);

 			column += col_step;
 			y_pos  += 2;
 		}
 		return(0);
 	}

 	/* General strides. */
 	x_step = 2 * inc_x;
 	y_step = 2 * inc_y;

 	for (col = 0; col < n; col++) {
 		sum_r = 0.0;
 		sum_i = 0.0;
 		x_pos = 0;
 		a_pos = 0;
 		for (row = 0; row < m; row++) {
 			ZGEMV_T_DOT(sum_r, sum_i, column[a_pos], column[a_pos + 1],
 				    x[x_pos], x[x_pos + 1]);
 			a_pos += 2;
 			x_pos += x_step;
 		}

 		ZGEMV_T_STORE(y[y_pos], y[y_pos + 1], sum_r, sum_i);

 		column += col_step;
 		y_pos  += y_step;
 	}
 	return(0);
 }

 #undef ZGEMV_T_DOT
 #undef ZGEMV_T_STORE
--- a/kernel/arm/zgemv_t_vfp.S
+++ b/kernel/arm/zgemv_t_vfp.S
@ -0,0 +1,608 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/29 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 /*
  * Register and stack-slot names used by the kernel below.
  * NOTE(review): ASSEMBLER is defined before common.h is included,
  * presumably to select its assembler-safe parts - confirm in common.h.
  */
 #define ASSEMBLER
 #include "common.h"
 /* Bytes of local stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
 #define STACKSIZE 256
 /*
  * Incoming stack arguments.  The prologue pushes seven core registers
  * (28 bytes) and then sets fp = sp + 28, so fp equals sp at entry and
  * [fp, #0] is the first stack-passed word.
  */
 #define	OLD_LDA		[fp, #0 ]
 #define	X		[fp, #4 ]
 #define	OLD_INC_X	[fp, #8 ]
 #define	Y		[fp, #12 ]
 #define	OLD_INC_Y	[fp, #16 ]
 /*
  * Incoming register arguments.  r3 (A) and r1 (N) are spilled to the
  * local slots A and N early in the kernel, because r1 is reused as AO1
  * and r3 as scratch.
  */
 #define OLD_A		r3
 #define	OLD_N		r1
 /* Working registers: row count, column pointers, column-pair counter. */
 #define M	r0
 #define AO1	r1
 #define J	r2
 #define AO2	r4
 /* Running x and y pointers. */
 #define XO	r5
 #define YO	r6
 /* Column stride and vector strides; the kernel scales them to bytes before use. */
 #define LDA	r7
 #define INC_X	r8
 #define INC_Y	r9
 /* Inner (row) loop counter. */
 #define I	r12
 /* Local spill slots inside the STACKSIZE area reserved below fp. */
 #define N	[fp, #-252 ]
 #define A	[fp, #-256 ]
 /* Prefetch distances; not referenced anywhere else in this file. */
 #define X_PRE	512
 #define A_PRE	512
 #define Y_PRE	32
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 /*
  * Sign selection for the four CONJ / XCONJ build variants.
  *
  * fmacd adds the product to the destination, fnmacd subtracts it.
  *
  *   KMAC_R / KMAC_I  - used in the KERNEL_* macros for the terms that
  *                      involve the second (imaginary) double of the
  *                      matrix element:
  *                        sum_r (+/-)= a_im * x_im
  *                        sum_i (+/-)= a_im * x_re
  *   FMAC_R1/R2/I1/I2 - used in the SAVE_* macros when the sums are scaled
  *                      by d0 / d1 and folded into y:
  *                        y_re (R1)= d0 * sum_r     y_re (R2)= d1 * sum_i
  *                        y_im (I1)= d0 * sum_i     y_im (I2)= d1 * sum_r
  *
  * FMAC_R1 and FMAC_I2 are fmacd in every variant; only KMAC_R, KMAC_I,
  * FMAC_R2 and FMAC_I1 change sign.
  */
 #if !defined(CONJ) && !defined(XCONJ)
        /* sum_r -= a_im*x_im, sum_i += a_im*x_re ; y_re -= d1*sum_i, y_im += d0*sum_i */
        #define KMAC_R  fnmacd
        #define KMAC_I  fmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fnmacd
        #define FMAC_I1 fmacd
        #define FMAC_I2 fmacd
 #elif defined(CONJ) && !defined(XCONJ)
        /* sum_r += a_im*x_im, sum_i -= a_im*x_re ; y_re -= d1*sum_i, y_im += d0*sum_i */
        #define KMAC_R  fmacd
        #define KMAC_I  fnmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fnmacd
        #define FMAC_I1 fmacd
        #define FMAC_I2 fmacd
 #elif !defined(CONJ) && defined(XCONJ)
        /* sum_r += a_im*x_im, sum_i -= a_im*x_re ; y_re += d1*sum_i, y_im -= d0*sum_i */
        #define KMAC_R  fmacd
        #define KMAC_I  fnmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fmacd
        #define FMAC_I1 fnmacd
        #define FMAC_I2 fmacd
 #else
        /* CONJ and XCONJ: sum_r -= a_im*x_im, sum_i += a_im*x_re ; y_re += d1*sum_i, y_im -= d0*sum_i */
        #define KMAC_R  fnmacd
        #define KMAC_I  fmacd
        #define FMAC_R1 fmacd
        #define FMAC_R2 fmacd
        #define FMAC_I1 fnmacd
        #define FMAC_I2 fmacd
 #endif
 /*
  * INIT_F2: clear the four accumulators d12..d15 (real/imag sums for two
  * columns) before a row loop.
  *
  * The registers are loaded with an all-zero bit pattern from r3 instead
  * of being subtracted from themselves: d - d yields NaN whenever d holds
  * NaN or Inf (a stale caller value or a previous column's non-finite
  * sum), which would poison every following result.
  *
  * Clobbers r3.  At every expansion site in this file r3 holds no live
  * value (the incoming A pointer has already been spilled to its stack
  * slot).
  */
 .macro INIT_F2
 	mov	r3 , #0
 	vmov	d12, r3 , r3
 	vmov	d13, r3 , r3
 	vmov	d14, r3 , r3
 	vmov	d15, r3 , r3
 .endm
 /* KERNEL_F2X4: four consecutive KERNEL_F2X1 steps (row loop unrolled by 4). */
 .macro KERNEL_F2X4
 	KERNEL_F2X1
 	KERNEL_F2X1
 	KERNEL_F2X1
 	KERNEL_F2X1
 .endm
 /*
  * KERNEL_F2X1: one row step for two columns, contiguous x.
  *   d2,d3   <- two doubles at XO  (x re, im);        XO  post-incremented
  *   d4,d5   <- two doubles at AO1 (column 1 re, im); AO1 post-incremented
  *   d8,d9   <- two doubles at AO2 (column 2 re, im); AO2 post-incremented
  *   d12,d13 += column-1 contribution (real, imag sums)
  *   d14,d15 += column-2 contribution (real, imag sums)
  * The a_re products are always added; KMAC_R / KMAC_I carry the
  * variant-dependent sign of the a_im products.
  */
 .macro KERNEL_F2X1
 	fldmiad	XO! ,  { d2 - d3 }
 	fldmiad	AO1!,  { d4 - d5 }
 	fmacd	d12 , d4 , d2
 	fmacd	d13 , d4 , d3
 	fldmiad	AO2!,  { d8 - d9   }
 	KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
 	fmacd	d14 , d8 , d2
 	fmacd	d15 , d8 , d3
        KMAC_R  d14 , d9 , d3
        KMAC_I  d15 , d9 , d2
 .endm
 /*
  * SAVE_F2: fold both column sums into two consecutive complex y elements.
  * Loads four doubles at YO into d4..d7, accumulates the d0- and d1-scaled
  * sums with the FMAC_* sign pattern, and stores them back with writeback,
  * so YO ends up past the two elements.
  * NOTE(review): d0 / d1 are never written in this file; presumably they
  * still hold alpha_r / alpha_i as passed under the hard-float ABI - confirm.
  */
 .macro	SAVE_F2
 	fldmiad	YO,  { d4 - d7 }
 	FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
        FMAC_R1 d6 , d0 , d14
        FMAC_I1 d7 , d0 , d15
        FMAC_R2 d6 , d1 , d15
        FMAC_I2 d7 , d1 , d14
 	fstmiad	YO!, { d4 - d7 }
 .endm
 /************************************************************************************************/
 /*
  * INIT_F1: clear the accumulators d12 / d13 (real/imag sum for a single
  * column) before a row loop.
  *
  * Zero is loaded from r3 rather than computed as d - d, because d - d is
  * NaN whenever d holds NaN or Inf, and that NaN would then propagate into
  * the result.
  *
  * Clobbers r3, which holds no live value at any expansion site in this
  * file.
  */
 .macro INIT_F1
 	mov	r3 , #0
 	vmov	d12, r3 , r3
 	vmov	d13, r3 , r3
 .endm
 /* KERNEL_F1X4: four consecutive KERNEL_F1X1 steps (row loop unrolled by 4). */
 .macro KERNEL_F1X4
 	KERNEL_F1X1
 	KERNEL_F1X1
 	KERNEL_F1X1
 	KERNEL_F1X1
 .endm
 /*
  * KERNEL_F1X1: one row step for a single column, contiguous x.
  *   d2,d3   <- two doubles at XO  (x re, im);      XO  post-incremented
  *   d4,d5   <- two doubles at AO1 (column re, im); AO1 post-incremented
  *   d12,d13 += contribution to the real / imag sums
  * KMAC_R / KMAC_I carry the variant-dependent sign of the a_im products.
  */
 .macro KERNEL_F1X1
 	fldmiad	XO! ,  { d2 - d3 }
 	fldmiad	AO1!,  { d4 - d5 }
 	fmacd	d12 , d4 , d2
 	fmacd	d13 , d4 , d3
 	KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
 .endm
 /*
  * SAVE_F1: fold the column sum (d12, d13) scaled by d0 / d1 into the
  * complex y element at YO and advance YO past it (store with writeback).
  */
 .macro	SAVE_F1
 	fldmiad	YO,  { d4 - d5 }
 	FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
 	fstmiad	YO!, { d4 - d5 }
 .endm
 /************************************************************************************************/
 /*
  * INIT_S2: clear the four accumulators d12..d15 (real/imag sums for two
  * columns) before a row loop of the strided path.
  *
  * Zero is loaded from r3 rather than computed as d - d, because d - d is
  * NaN whenever d holds NaN or Inf (stale register contents or a previous
  * column's non-finite sum).
  *
  * Clobbers r3, which holds no live value at any expansion site in this
  * file.
  */
 .macro INIT_S2
 	mov	r3 , #0
 	vmov	d12, r3 , r3
 	vmov	d13, r3 , r3
 	vmov	d14, r3 , r3
 	vmov	d15, r3 , r3
 .endm
 /* KERNEL_S2X4: four consecutive KERNEL_S2X1 steps (row loop unrolled by 4). */
 .macro KERNEL_S2X4
 	KERNEL_S2X1
 	KERNEL_S2X1
 	KERNEL_S2X1
 	KERNEL_S2X1
 .endm
 /*
  * KERNEL_S2X1: one row step for two columns, strided x.
  * Same arithmetic as KERNEL_F2X1, but x is loaded without writeback and
  * XO is then advanced by INC_X, which the strided entry path has already
  * converted to a byte stride.
  */
 .macro KERNEL_S2X1
 	fldmiad	XO  ,  { d2 - d3 }
 	fldmiad	AO1!,  { d4 - d5 }
 	fldmiad	AO2!,  { d8 - d9   }
 	fmacd	d12 , d4 , d2
 	fmacd	d13 , d4 , d3
 	KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
 	fmacd	d14 , d8 , d2
 	fmacd	d15 , d8 , d3
        KMAC_R  d14 , d9 , d3
        KMAC_I  d15 , d9 , d2
 	add	XO, XO, INC_X
 .endm
 /*
  * SAVE_S2: fold both column sums into two y elements that are INC_Y bytes
  * apart.  Each element is loaded, updated with the d0- / d1-scaled sums
  * and stored back in place; YO is stepped by INC_Y after each one.
  */
 .macro	SAVE_S2
 	fldmiad	YO,  { d4 - d5 }
 	FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
 	fstmiad	YO,  { d4 - d5 }
 	add	YO, YO, INC_Y
 	fldmiad	YO,  { d6 - d7 }
        FMAC_R1 d6 , d0 , d14
        FMAC_I1 d7 , d0 , d15
        FMAC_R2 d6 , d1 , d15
        FMAC_I2 d7 , d1 , d14
 	fstmiad	YO,  { d6 - d7 }
 	add	YO, YO, INC_Y
 .endm
 /************************************************************************************************/
 /*
  * INIT_S1: clear the accumulators d12 / d13 (real/imag sum for a single
  * column) before a row loop of the strided path.
  *
  * Zero is loaded from r3 rather than computed as d - d, because d - d is
  * NaN whenever d holds NaN or Inf.
  *
  * Clobbers r3, which holds no live value at any expansion site in this
  * file.
  */
 .macro INIT_S1
 	mov	r3 , #0
 	vmov	d12, r3 , r3
 	vmov	d13, r3 , r3
 .endm
 /* KERNEL_S1X4: four consecutive KERNEL_S1X1 steps (row loop unrolled by 4). */
 .macro KERNEL_S1X4
 	KERNEL_S1X1
 	KERNEL_S1X1
 	KERNEL_S1X1
 	KERNEL_S1X1
 .endm
 /*
  * KERNEL_S1X1: one row step for a single column, strided x.
  * Same arithmetic as KERNEL_F1X1; x is loaded without writeback and XO is
  * then advanced by the byte stride held in INC_X.
  */
 .macro KERNEL_S1X1
 	fldmiad	XO  ,  { d2 - d3 }
 	fldmiad	AO1!,  { d4 - d5 }
 	fmacd	d12 , d4 , d2
 	fmacd	d13 , d4 , d3
 	KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
 	add	XO, XO, INC_X
 .endm
 /*
  * SAVE_S1: fold the column sum (d12, d13) scaled by d0 / d1 into the y
  * element at YO, store it back in place and step YO by INC_Y bytes.
  */
 .macro	SAVE_S1
 	fldmiad	YO,  { d4 - d5 }
 	FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
 	fstmiad	YO,  { d4 - d5 }
 	add	YO, YO, INC_Y
 .endm
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 /*
  * Kernel entry.  Columns are processed two at a time (J = N / 2 passes)
  * with one extra pass when N is odd; rows are processed four at a time
  * with an M & 3 remainder loop.  There are two copies of that structure:
  * the F* labels for inc_x == 1 and inc_y == 1, the S* labels for every
  * other stride.  Always returns 0 in r0.
  */
 	PROLOGUE
 	.align 5
 	/* Save core registers; fp = sp + 28 = sp at entry, so stack args sit at [fp, #0...]. */
 	push    {r4 - r9 , fp}
        add     fp, sp, #28
 	sub     sp, sp, #STACKSIZE                              // reserve stack
 	/* Floating point save area starts at fp - 192, inside the reserved block. */
        sub     r12, fp, #192
 #if	defined(DOUBLE)
        vstm    r12, { d8 - d15 }                                 // store floating point registers
 #else
 	/*
 	 * NOTE(review): this branch saves only s8 - s15, while the macros in
 	 * this file write d8, d9 and d12 - d15.  Presumably this file is only
 	 * ever built with DOUBLE defined - confirm.
 	 */
        vstm    r12, { s8 - s15 }                                 // store floating point registers
 #endif
 	/* Nothing to do for M <= 0 or N <= 0. */
 	cmp	M, #0
 	ble	zgemvt_kernel_L999
 	cmp	OLD_N, #0
 	ble	zgemvt_kernel_L999
 	/* Spill A and N: r3 becomes scratch and r1 is reused as AO1. */
 	str	OLD_A, A
 	str	OLD_N, N
 	ldr    INC_X , OLD_INC_X
 	ldr    INC_Y , OLD_INC_Y
 	/* A zero stride in either vector also exits without touching y. */
 	cmp	INC_X, #0
 	beq	zgemvt_kernel_L999
 	cmp	INC_Y, #0
 	beq	zgemvt_kernel_L999
 	/* Convert the column stride from complex elements to bytes. */
 	ldr	LDA, OLD_LDA
 #if defined(DOUBLE)
 	lsl	LDA, LDA, #4				// LDA * SIZE
 #else
 	lsl	LDA, LDA, #3				// LDA * SIZE
 #endif
 	/* Any stride other than 1 in x or y selects the strided (S) path. */
 	cmp	INC_X, #1
 	bne	zgemvt_kernel_S2_BEGIN
 	cmp	INC_Y, #1
 	bne	zgemvt_kernel_S2_BEGIN
 /* ---- contiguous path: two columns per pass ---- */
 zgemvt_kernel_F2_BEGIN:
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	zgemvt_kernel_F1_BEGIN
 zgemvt_kernel_F2X4:
 	/* AO1 / AO2 = current column pair; the A slot is advanced by two columns. */
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	/* x is re-read from its start for every column pair. */
 	ldr	XO , X
 	INIT_F2
 	asrs	I, M, #2					// I = M / 4
 	ble	zgemvt_kernel_F2X1
 zgemvt_kernel_F2X4_10:
 	KERNEL_F2X4
 	subs	I, I, #1
 	bne	zgemvt_kernel_F2X4_10
 zgemvt_kernel_F2X1:
 	/* Remaining M & 3 rows, one at a time. */
 	ands	I, M , #3
 	ble	zgemvt_kernel_F2_END
 zgemvt_kernel_F2X1_10:
 	KERNEL_F2X1
 	subs	I, I, #1
 	bne	zgemvt_kernel_F2X1_10
 zgemvt_kernel_F2_END:
 	SAVE_F2
 	subs	J , J , #1
 	bne	zgemvt_kernel_F2X4
 /* ---- contiguous path: last column when N is odd ---- */
 zgemvt_kernel_F1_BEGIN:
 	ldr	J, N
 	ands	J, J, #1
 	ble	zgemvt_kernel_L999
 zgemvt_kernel_F1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_F1
 	asrs	I, M, #2					// I = M / 4
 	ble	zgemvt_kernel_F1X1
 zgemvt_kernel_F1X4_10:
 	KERNEL_F1X4
 	subs	I, I, #1
 	bne	zgemvt_kernel_F1X4_10
 zgemvt_kernel_F1X1:
 	/* Remaining M & 3 rows, one at a time. */
 	ands	I, M , #3
 	ble	zgemvt_kernel_F1_END
 zgemvt_kernel_F1X1_10:
 	KERNEL_F1X1
 	subs	I, I, #1
 	bne	zgemvt_kernel_F1X1_10
 zgemvt_kernel_F1_END:
 	SAVE_F1
 	b	zgemvt_kernel_L999
 /*************************************************************************************************************/
 /* ---- strided path: same loop structure, strides first converted to bytes ---- */
 zgemvt_kernel_S2_BEGIN:
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
 #endif
 	ldr	YO , Y
 	ldr	J, N
 	asrs	J, J, #1					// J = N / 2
 	ble	zgemvt_kernel_S1_BEGIN
 zgemvt_kernel_S2X4:
 	/* AO1 / AO2 = current column pair; the A slot is advanced by two columns. */
 	ldr	AO1, A
 	add	AO2, AO1, LDA
 	add	r3 , AO2, LDA
 	str	r3 , A
 	ldr	XO , X
 	INIT_S2
 	asrs	I, M, #2					// I = M / 4
 	ble	zgemvt_kernel_S2X1
 zgemvt_kernel_S2X4_10:
 	KERNEL_S2X4
 	subs	I, I, #1
 	bne	zgemvt_kernel_S2X4_10
 zgemvt_kernel_S2X1:
 	/* Remaining M & 3 rows, one at a time. */
 	ands	I, M , #3
 	ble	zgemvt_kernel_S2_END
 zgemvt_kernel_S2X1_10:
 	KERNEL_S2X1
 	subs	I, I, #1
 	bne	zgemvt_kernel_S2X1_10
 zgemvt_kernel_S2_END:
 	SAVE_S2
 	subs	J , J , #1
 	bne	zgemvt_kernel_S2X4
 /* ---- strided path: last column when N is odd ---- */
 zgemvt_kernel_S1_BEGIN:
 	ldr	J, N
 	ands	J, J, #1
 	ble	zgemvt_kernel_L999
 zgemvt_kernel_S1X4:
 	ldr	AO1, A
 	ldr	XO , X
 	INIT_S1
 	asrs	I, M, #2					// I = M / 4
 	ble	zgemvt_kernel_S1X1
 zgemvt_kernel_S1X4_10:
 	KERNEL_S1X4
 	subs	I, I, #1
 	bne	zgemvt_kernel_S1X4_10
 zgemvt_kernel_S1X1:
 	/* Remaining M & 3 rows, one at a time. */
 	ands	I, M , #3
 	ble	zgemvt_kernel_S1_END
 zgemvt_kernel_S1X1_10:
 	KERNEL_S1X1
 	subs	I, I, #1
 	bne	zgemvt_kernel_S1X1_10
 zgemvt_kernel_S1_END:
 	SAVE_S1
 /*************************************************************************************************************/
 /* Common exit: restore the saved FP and core registers and return 0. */
 zgemvt_kernel_L999:
        sub     r3, fp, #192
 #if	defined(DOUBLE)
        vldm    r3, { d8 - d15 }                                 // restore floating point registers
 #else
        vldm    r3, { s8 - s15 }                                 // restore floating point registers
 #endif
 	mov	r0, #0		// set return value
 	/* fp - 28 is where sp pointed right after the push in the prologue. */
 	sub     sp, fp, #28
 	pop     {r4 -r9 ,fp}
 	bx	lr
 	EPILOGUE
--- a/kernel/arm/znrm2.c
+++ b/kernel/arm/znrm2.c
@ -0,0 +1,106 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/13 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 /*
  * Euclidean norm of a complex vector, computed with a running scale so
  * that squaring the components neither overflows nor underflows.
  *
  *   n      number of complex elements
  *   x      interleaved (re, im) data
  *   inc_x  stride between elements, in complex elements
  *
  * Returns 0.0 when n < 0 or inc_x < 1.  Otherwise returns
  * scale * sqrt(ssq), where scale is the largest finite magnitude seen and
  * ssq the sum of squares of all finite components relative to it.
  *
  * Infinite components are kept out of the scaled sum: feeding a second
  * infinity through it would evaluate Inf / Inf and turn the result into
  * NaN.  If any component is infinite the result is that infinity, unless
  * the remaining data already produced NaN, in which case NaN is returned
  * as before.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	BLASLONG k;
 	FLOAT scale = 0.0;
 	FLOAT ssq   = 1.0;
 	FLOAT inf_mag = 0.0;	/* magnitude of an infinite component, 0 if none seen */
 	BLASLONG inc_x2;
 	FLOAT temp;
 	FLOAT result;

 	if (n < 0 || inc_x < 1 ) return(0.0);

 	inc_x2 = 2 * inc_x;
 	n *= inc_x2;		/* from here on n is the end index in FLOATs */

 	while(i < n)
 	{
 		/* k = 0 is the real part, k = 1 the imaginary part. */
 		for (k = 0; k < 2; k++)
 		{
 			/* Zeros add nothing; skipping them also avoids 0 / 0. */
 			if ( x[i+k] == 0.0 ) continue;

 			temp = ABS( x[i+k] );

 			if ( isinf(temp) )
 			{
 				inf_mag = temp;
 				continue;
 			}

 			if ( scale < temp )
 			{
 				/* New largest magnitude: rescale the sum accumulated so far. */
 				ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
 				scale = temp ;
 			}
 			else
 			{
 				ssq += ( temp / scale ) * ( temp / scale );
 			}
 		}
 		i += inc_x2;
 	}

 	result = scale * sqrt( ssq );

 	/* NaN in the input still wins over an infinity. */
 	if ( inf_mag != 0.0 && !isnan(result) ) return(inf_mag);

 	return(result);
 }
--- a/kernel/arm/zrot.c
+++ b/kernel/arm/zrot.c
@ -0,0 +1,68 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
 	/*
 	 * Apply a plane rotation with real coefficients c and s to n pairs of
 	 * complex elements:
 	 *     x' = c*x + s*y
 	 *     y' = c*y - s*x
 	 * x and y hold interleaved (re, im) values and are stepped by
 	 * 2*inc_x and 2*inc_y FLOATs.  Nothing is done for n <= 0.
 	 * Always returns 0.
 	 */
 	BLASLONG k;
 	BLASLONG xpos = 0;
 	BLASLONG ypos = 0;
 	BLASLONG x_step, y_step;
 	FLOAT rot_re, rot_im;

 	if ( n <= 0 )
 	{
 		return(0);
 	}

 	x_step = 2 * inc_x ;
 	y_step = 2 * inc_y ;

 	for (k = 0; k < n; k++)
 	{
 		/* The new x is held back until y has been updated from the old x. */
 		rot_re    = c*x[xpos]   + s*y[ypos] ;
 		rot_im    = c*x[xpos+1] + s*y[ypos+1] ;

 		y[ypos]   = c*y[ypos]   - s*x[xpos] ;
 		y[ypos+1] = c*y[ypos+1] - s*x[xpos+1] ;

 		x[xpos]   = rot_re ;
 		x[xpos+1] = rot_im ;

 		xpos += x_step ;
 		ypos += y_step ;
 	}
 	return(0);
 }
--- a/kernel/arm/zscal.c
+++ b/kernel/arm/zscal.c
@ -0,0 +1,64 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/09/14 Saar
 *	 BLASTEST float		: OK
 * 	 BLASTEST double	: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #include "common.h"
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	/*
 	 * Scale n complex elements of x in place by the complex factor
 	 * (da_r, da_i).  x holds interleaved (re, im) values and is stepped
 	 * by 2*inc_x FLOATs.  Nothing is done when n < 0 or inc_x < 1.
 	 * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used.
 	 * Always returns 0.
 	 */
 	BLASLONG k;
 	BLASLONG pos = 0;
 	BLASLONG step;
 	FLOAT re, im;

 	if ( n < 0 || inc_x < 1 )
 	{
 		return(0);
 	}

 	step = 2 * inc_x;

 	for ( k = 0; k < n; k++, pos += step )
 	{
 		/* Read both parts first so each product uses the old values. */
 		re = x[pos];
 		im = x[pos+1];

 		x[pos]   = da_r * re - da_i * im ;
 		x[pos+1] = da_r * im + da_i * re ;
 	}
 	return(0);
 }
--- a/Show More
+++ b/Show More