diff --git a/Makefile b/Makefile index b34bb91a5..343bd72f4 100644 --- a/Makefile +++ b/Makefile @@ -262,11 +262,10 @@ endif lapack-test : - $(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib - $(MAKE) -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc - @rm -f $(NETLIB_LAPACK_DIR)/TESTING/*.out - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING - $(GREP) failed $(NETLIB_LAPACK_DIR)/TESTING/*.out + make -j 1 -C $(NETLIB_LAPACK_DIR) tmglib + make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc + (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) + dummy : diff --git a/Makefile.arm b/Makefile.arm index 8502d5286..5bdd4d151 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -10,3 +10,9 @@ FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 endif +ifeq ($(CORE), ARMV5) +CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +endif + + diff --git a/Makefile.rule b/Makefile.rule index e4b0465f5..6b604b781 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -76,10 +76,10 @@ VERSION = 0.2.9.rc2 # Unfortunately most of kernel won't give us high quality buffer. # BLAS tries to find the best region before entering main function, # but it will consume time. If you don't like it, you can disable one. -# NO_WARMUP = 1 +NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. -# NO_AFFINITY = 1 +NO_AFFINITY = 1 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # and OS. However, the performance is low. @@ -129,6 +129,9 @@ VERSION = 0.2.9.rc2 # The default -O2 is enough. # COMMON_OPT = -O2 +# gfortran option for LAPACK +FCOMMON_OPT = -frecursive + # Profiling flags COMMON_PROF = -pg diff --git a/Makefile.system b/Makefile.system index 7ba45f2e1..ade4f9320 100644 --- a/Makefile.system +++ b/Makefile.system @@ -158,6 +158,7 @@ endif ifeq ($(OSNAME), Linux) EXTRALIB += -lm +NO_EXPRECISION = 1 endif ifeq ($(OSNAME), AIX) @@ -846,19 +847,6 @@ ifeq ($(DEBUG), 1) COMMON_OPT += -g endif -ifndef COMMON_OPT -ifeq ($(ARCH), arm) -COMMON_OPT = -O3 -endif -endif - -ifndef COMMON_OPT -ifeq ($(ARCH), arm64) -COMMON_OPT = -O3 -endif -endif - - ifndef COMMON_OPT COMMON_OPT = -O2 endif diff --git a/common.h b/common.h index 57294eb02..49e2946e7 100644 --- a/common.h +++ b/common.h @@ -310,10 +310,17 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif -#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) #define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #endif +#ifdef BULLDOZER +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + + #ifdef PILEDRIVER #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); diff --git a/driver/others/Makefile b/driver/others/Makefile index d7a965c9a..ca05c5129 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -3,7 +3,7 @@ include ../../Makefile.system COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) -COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) +#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) diff --git a/getarch.c b/getarch.c index 6e1938434..b100eb52f 100644 --- a/getarch.c +++ b/getarch.c @@ -724,6 +724,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_ARMV5 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "ARMV5" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DARMV5 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFP" +#define LIBNAME "armv5" +#define CORENAME "ARMV5" +#else +#endif + + #ifdef FORCE_ARMV8 #define FORCE #define ARCHITECTURE "ARM64" diff --git a/interface/Makefile b/interface/Makefile index a700e6542..16d59a6e6 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -2,11 +2,11 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system ifeq ($(ARCH), x86) -SUPPORT_GEMM3M = 1 +SUPPORT_GEMM3M = 0 endif ifeq ($(ARCH), x86_64) -SUPPORT_GEMM3M = 1 +SUPPORT_GEMM3M = 0 endif ifeq ($(ARCH), ia64) @@ -342,30 +342,51 @@ CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) +#SLAPACKOBJS = \ +# sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \ +# spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \ +# slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \ + SLAPACKOBJS = \ - sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \ - spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \ - slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \ + sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ + spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) + + +#DLAPACKOBJS = \ +# dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \ +# dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \ +# dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \ DLAPACKOBJS = \ - dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \ - dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \ - dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \ + dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ + dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) + QLAPACKOBJS = \ qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ +#CLAPACKOBJS = \ +# cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \ +# cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \ +# claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \ + CLAPACKOBJS = \ - cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \ - cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \ - claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \ + cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ + cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) + + +#ZLAPACKOBJS = \ +# zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \ +# zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \ +# zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \ ZLAPACKOBJS = \ - zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \ - zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \ - zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \ + zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ + zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) + + XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ @@ -375,10 +396,10 @@ XLAPACKOBJS = \ ifneq ($(NO_LAPACK), 1) SBLASOBJS += $(SLAPACKOBJS) DBLASOBJS += $(DLAPACKOBJS) -QBLASOBJS += $(QLAPACKOBJS) +#QBLASOBJS += $(QLAPACKOBJS) CBLASOBJS += $(CLAPACKOBJS) ZBLASOBJS += $(ZLAPACKOBJS) -XBLASOBJS += $(XLAPACKOBJS) +#XBLASOBJS += $(XLAPACKOBJS) endif @@ -1731,37 +1752,37 @@ cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) -sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : getf2.c +sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -dgetf2.$(SUFFIX) dgetf2.$(PSUFFIX) : getf2.c +dgetf2.$(SUFFIX) dgetf2.$(PSUFFIX) : lapack/getf2.c $(CC) -c $(CFLAGS) $< -o $(@F) qgetf2.$(SUFFIX) qgetf2.$(PSUFFIX) : getf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -cgetf2.$(SUFFIX) cgetf2.$(PSUFFIX) : zgetf2.c +cgetf2.$(SUFFIX) cgetf2.$(PSUFFIX) : lapack/zgetf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -zgetf2.$(SUFFIX) zgetf2.$(PSUFFIX) : zgetf2.c +zgetf2.$(SUFFIX) zgetf2.$(PSUFFIX) : lapack/zgetf2.c $(CC) -c $(CFLAGS) $< -o $(@F) xgetf2.$(SUFFIX) xgetf2.$(PSUFFIX) : zgetf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -sgetrf.$(SUFFIX) sgetrf.$(PSUFFIX) : getrf.c +sgetrf.$(SUFFIX) sgetrf.$(PSUFFIX) : lapack/getrf.c $(CC) -c $(CFLAGS) $< -o $(@F) -dgetrf.$(SUFFIX) dgetrf.$(PSUFFIX) : getrf.c +dgetrf.$(SUFFIX) dgetrf.$(PSUFFIX) : lapack/getrf.c $(CC) -c $(CFLAGS) $< -o $(@F) qgetrf.$(SUFFIX) qgetrf.$(PSUFFIX) : getrf.c $(CC) -c $(CFLAGS) $< -o $(@F) -cgetrf.$(SUFFIX) cgetrf.$(PSUFFIX) : zgetrf.c +cgetrf.$(SUFFIX) cgetrf.$(PSUFFIX) : lapack/zgetrf.c $(CC) -c $(CFLAGS) $< -o $(@F) -zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : zgetrf.c +zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : lapack/zgetrf.c $(CC) -c $(CFLAGS) $< -o $(@F) xgetrf.$(SUFFIX) xgetrf.$(PSUFFIX) : zgetrf.c @@ -1803,37 +1824,37 @@ zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : zlauum.c xlauum.$(SUFFIX) xlauum.$(PSUFFIX) : zlauum.c $(CC) -c $(CFLAGS) $< -o $(@F) -spotf2.$(SUFFIX) spotf2.$(PSUFFIX) : potf2.c +spotf2.$(SUFFIX) spotf2.$(PSUFFIX) : lapack/potf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -dpotf2.$(SUFFIX) dpotf2.$(PSUFFIX) : potf2.c +dpotf2.$(SUFFIX) dpotf2.$(PSUFFIX) : lapack/potf2.c $(CC) -c $(CFLAGS) $< -o $(@F) qpotf2.$(SUFFIX) qpotf2.$(PSUFFIX) : potf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -cpotf2.$(SUFFIX) cpotf2.$(PSUFFIX) : zpotf2.c +cpotf2.$(SUFFIX) cpotf2.$(PSUFFIX) : lapack/zpotf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -zpotf2.$(SUFFIX) zpotf2.$(PSUFFIX) : zpotf2.c +zpotf2.$(SUFFIX) zpotf2.$(PSUFFIX) : lapack/zpotf2.c $(CC) -c $(CFLAGS) $< -o $(@F) xpotf2.$(SUFFIX) xpotf2.$(PSUFFIX) : zpotf2.c $(CC) -c $(CFLAGS) $< -o $(@F) -spotrf.$(SUFFIX) spotrf.$(PSUFFIX) : potrf.c +spotrf.$(SUFFIX) spotrf.$(PSUFFIX) : lapack/potrf.c $(CC) -c $(CFLAGS) $< -o $(@F) -dpotrf.$(SUFFIX) dpotrf.$(PSUFFIX) : potrf.c +dpotrf.$(SUFFIX) dpotrf.$(PSUFFIX) : lapack/potrf.c $(CC) -c $(CFLAGS) $< -o $(@F) qpotrf.$(SUFFIX) qpotrf.$(PSUFFIX) : potrf.c $(CC) -c $(CFLAGS) $< -o $(@F) -cpotrf.$(SUFFIX) cpotrf.$(PSUFFIX) : zpotrf.c +cpotrf.$(SUFFIX) cpotrf.$(PSUFFIX) : lapack/zpotrf.c $(CC) -c $(CFLAGS) $< -o $(@F) -zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : zpotrf.c +zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : lapack/zpotrf.c $(CC) -c $(CFLAGS) $< -o $(@F) xpotrf.$(SUFFIX) xpotrf.$(PSUFFIX) : zpotrf.c @@ -1875,55 +1896,55 @@ ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c $(CC) -c $(CFLAGS) $< -o $(@F) -slaswp.$(SUFFIX) slaswp.$(PSUFFIX) : laswp.c +slaswp.$(SUFFIX) slaswp.$(PSUFFIX) : lapack/laswp.c $(CC) -c $(CFLAGS) $< -o $(@F) -dlaswp.$(SUFFIX) dlaswp.$(PSUFFIX) : laswp.c +dlaswp.$(SUFFIX) dlaswp.$(PSUFFIX) : lapack/laswp.c $(CC) -c $(CFLAGS) $< -o $(@F) qlaswp.$(SUFFIX) qlaswp.$(PSUFFIX) : laswp.c $(CC) -c $(CFLAGS) $< -o $(@F) -claswp.$(SUFFIX) claswp.$(PSUFFIX) : zlaswp.c +claswp.$(SUFFIX) claswp.$(PSUFFIX) : lapack/zlaswp.c $(CC) -c $(CFLAGS) $< -o $(@F) -zlaswp.$(SUFFIX) zlaswp.$(PSUFFIX) : zlaswp.c +zlaswp.$(SUFFIX) zlaswp.$(PSUFFIX) : lapack/zlaswp.c $(CC) -c $(CFLAGS) $< -o $(@F) xlaswp.$(SUFFIX) xlaswp.$(PSUFFIX) : zlaswp.c $(CC) -c $(CFLAGS) $< -o $(@F) -sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : getrs.c +sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : getrs.c +dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : zgetrs.c +cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : zgetrs.c +zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : gesv.c +sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) -dgesv.$(SUFFIX) dgesv.$(PSUFFIX) : gesv.c +dgesv.$(SUFFIX) dgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) qgesv.$(SUFFIX) qgesv.$(PSUFFIX) : gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) -cgesv.$(SUFFIX) cgesv.$(PSUFFIX) : gesv.c +cgesv.$(SUFFIX) cgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) -zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : gesv.c +zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) xgesv.$(SUFFIX) xgesv.$(PSUFFIX) : gesv.c diff --git a/interface/gesv.c b/interface/lapack/gesv.c similarity index 100% rename from interface/gesv.c rename to interface/lapack/gesv.c diff --git a/interface/getf2.c b/interface/lapack/getf2.c similarity index 100% rename from interface/getf2.c rename to interface/lapack/getf2.c diff --git a/interface/getrf.c b/interface/lapack/getrf.c similarity index 100% rename from interface/getrf.c rename to interface/lapack/getrf.c diff --git a/interface/getrs.c b/interface/lapack/getrs.c similarity index 100% rename from interface/getrs.c rename to interface/lapack/getrs.c diff --git a/interface/larf.c b/interface/lapack/larf.c.obsolete similarity index 100% rename from interface/larf.c rename to interface/lapack/larf.c.obsolete diff --git a/interface/laswp.c b/interface/lapack/laswp.c similarity index 100% rename from interface/laswp.c rename to interface/lapack/laswp.c diff --git a/interface/lauu2.c b/interface/lapack/lauu2.c.bad similarity index 100% rename from interface/lauu2.c rename to interface/lapack/lauu2.c.bad diff --git a/interface/lauum.c b/interface/lapack/lauum.c.bad similarity index 100% rename from interface/lauum.c rename to interface/lapack/lauum.c.bad diff --git a/interface/potf2.c b/interface/lapack/potf2.c similarity index 100% rename from interface/potf2.c rename to interface/lapack/potf2.c diff --git a/interface/potrf.c b/interface/lapack/potrf.c similarity index 100% rename from interface/potrf.c rename to interface/lapack/potrf.c diff --git a/interface/potri.c b/interface/lapack/potri.c.bad similarity index 100% rename from interface/potri.c rename to interface/lapack/potri.c.bad diff --git a/interface/trti2.c b/interface/lapack/trti2.c.bad similarity index 100% rename from interface/trti2.c rename to interface/lapack/trti2.c.bad diff --git a/interface/trtri.c b/interface/lapack/trtri.c.bad similarity index 100% rename from interface/trtri.c rename to interface/lapack/trtri.c.bad diff --git a/interface/zgetf2.c b/interface/lapack/zgetf2.c similarity index 100% rename from interface/zgetf2.c rename to interface/lapack/zgetf2.c diff --git a/interface/zgetrf.c b/interface/lapack/zgetrf.c similarity index 100% rename from interface/zgetrf.c rename to interface/lapack/zgetrf.c diff --git a/interface/zgetrs.c b/interface/lapack/zgetrs.c similarity index 100% rename from interface/zgetrs.c rename to interface/lapack/zgetrs.c diff --git a/interface/zlaswp.c b/interface/lapack/zlaswp.c similarity index 100% rename from interface/zlaswp.c rename to interface/lapack/zlaswp.c diff --git a/interface/zlauu2.c b/interface/lapack/zlauu2.c.bad similarity index 100% rename from interface/zlauu2.c rename to interface/lapack/zlauu2.c.bad diff --git a/interface/zlauum.c b/interface/lapack/zlauum.c.bad similarity index 100% rename from interface/zlauum.c rename to interface/lapack/zlauum.c.bad diff --git a/interface/zpotf2.c b/interface/lapack/zpotf2.c similarity index 100% rename from interface/zpotf2.c rename to interface/lapack/zpotf2.c diff --git a/interface/zpotrf.c b/interface/lapack/zpotrf.c similarity index 100% rename from interface/zpotrf.c rename to interface/lapack/zpotrf.c diff --git a/interface/zpotri.c b/interface/lapack/zpotri.c.bad similarity index 100% rename from interface/zpotri.c rename to interface/lapack/zpotri.c.bad diff --git a/interface/ztrti2.c b/interface/lapack/ztrti2.c.bad similarity index 100% rename from interface/ztrti2.c rename to interface/lapack/ztrti2.c.bad diff --git a/interface/ztrtri.c b/interface/lapack/ztrtri.c.bad similarity index 100% rename from interface/ztrtri.c rename to interface/lapack/ztrtri.c.bad diff --git a/interface/sbmv.c b/interface/sbmv.c index 2ffe7f166..c481d5609 100644 --- a/interface/sbmv.c +++ b/interface/sbmv.c @@ -61,7 +61,7 @@ static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLA #endif }; -#ifdef SMP +#ifdef SMPBUG static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qsbmv_thread_U, qsbmv_thread_L, @@ -90,7 +90,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * blasint info; int uplo; FLOAT *buffer; -#ifdef SMP +#ifdef SMPBUG int nthreads; #endif @@ -130,7 +130,7 @@ void CNAME(enum CBLAS_ORDER order, FLOAT *buffer; int uplo; blasint info; -#ifdef SMP +#ifdef SMPBUG int nthreads; #endif @@ -189,7 +189,7 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); -#ifdef SMP +#ifdef SMPBUG nthreads = num_cpu_avail(2); if (nthreads == 1) { @@ -197,7 +197,7 @@ void CNAME(enum CBLAS_ORDER order, (sbmv[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer); -#ifdef SMP +#ifdef SMPBUG } else { (sbmv_thread[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer, nthreads); diff --git a/interface/spmv.c b/interface/spmv.c index 8d8902763..3f853e56e 100644 --- a/interface/spmv.c +++ b/interface/spmv.c @@ -61,7 +61,7 @@ static int (*spmv[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLAS #endif }; -#ifdef SMP +#ifdef SMPTEST static int (*spmv_thread[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qspmv_thread_U, qspmv_thread_L, @@ -88,7 +88,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint info; int uplo; FLOAT *buffer; -#ifdef SMP +#ifdef SMPTEST int nthreads; #endif @@ -126,7 +126,7 @@ void CNAME(enum CBLAS_ORDER order, FLOAT *buffer; int uplo; blasint info; -#ifdef SMP +#ifdef SMPTEST int nthreads; #endif @@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); -#ifdef SMP +#ifdef SMPTEST nthreads = num_cpu_avail(2); if (nthreads == 1) { @@ -189,7 +189,7 @@ void CNAME(enum CBLAS_ORDER order, (spmv[uplo])(n, alpha, a, x, incx, y, incy, buffer); -#ifdef SMP +#ifdef SMPTEST } else { (spmv_thread[uplo])(n, alpha, a, x, incx, y, incy, buffer, nthreads); diff --git a/interface/syr2k.c b/interface/syr2k.c index 01fbe648f..381e088a6 100644 --- a/interface/syr2k.c +++ b/interface/syr2k.c @@ -145,12 +145,21 @@ void NAME(char *UPLO, char *TRANS, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; +#ifndef COMPLEX if (trans_arg == 'N') trans = 0; -#ifndef HEMM if (trans_arg == 'T') trans = 1; - if (trans_arg == 'R') trans = 0; -#endif if (trans_arg == 'C') trans = 1; +#else +#ifdef HEMM + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'C') trans = 1; +#else + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; +#endif + +#endif + nrowa = args.n; if (trans & 1) nrowa = args.k; diff --git a/interface/syrk.c b/interface/syrk.c index b85e2c880..072cc86f5 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -148,12 +148,21 @@ void NAME(char *UPLO, char *TRANS, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; + +#ifndef COMPLEX if (trans_arg == 'N') trans = 0; -#ifndef HEMM if (trans_arg == 'T') trans = 1; - if (trans_arg == 'R') trans = 0; -#endif if (trans_arg == 'C') trans = 1; +#else +#ifdef HEMM + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'C') trans = 1; +#else + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; +#endif + +#endif nrowa = args.n; if (trans & 1) nrowa = args.k; diff --git a/interface/zhbmv.c b/interface/zhbmv.c index c14ad9859..00ba915db 100644 --- a/interface/zhbmv.c +++ b/interface/zhbmv.c @@ -61,7 +61,7 @@ static int (*hbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT #endif }; -#ifdef SMP +#ifdef SMPBUG static int (*hbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xhbmv_thread_U, xhbmv_thread_L, xhbmv_thread_V, xhbmv_thread_M, @@ -92,7 +92,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * blasint info; int uplo; FLOAT *buffer; -#ifdef SMP +#ifdef SMPBUG int nthreads; #endif @@ -138,7 +138,7 @@ void CNAME(enum CBLAS_ORDER order, FLOAT *buffer; int uplo; blasint info; -#ifdef SMP +#ifdef SMPBUG int nthreads; #endif @@ -197,7 +197,7 @@ void CNAME(enum CBLAS_ORDER order, buffer = (FLOAT *)blas_memory_alloc(1); -#ifdef SMP +#ifdef SMPBUG nthreads = num_cpu_avail(2); if (nthreads == 1) { @@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order, (hbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); -#ifdef SMP +#ifdef SMPBUG } else { (hbmv_thread[uplo])(n, k, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); diff --git a/interface/zsbmv.c b/interface/zsbmv.c index 71c03a660..6d445d7ee 100644 --- a/interface/zsbmv.c +++ b/interface/zsbmv.c @@ -61,7 +61,7 @@ static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT #endif }; -#ifdef SMP +#ifdef SMPBUG static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xsbmv_thread_U, xsbmv_thread_L, @@ -90,7 +90,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * blasint info; int uplo; FLOAT *buffer; -#ifdef SMP +#ifdef SMPBUG int nthreads; #endif @@ -131,7 +131,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * buffer = (FLOAT *)blas_memory_alloc(1); -#ifdef SMP +#ifdef SMPBUG nthreads = num_cpu_avail(2); if (nthreads == 1) { @@ -139,7 +139,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * (sbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); -#ifdef SMP +#ifdef SMPBUG } else { (sbmv_thread[uplo])(n, k, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); diff --git a/interface/zspmv.c b/interface/zspmv.c index ecf1af586..65550872d 100644 --- a/interface/zspmv.c +++ b/interface/zspmv.c @@ -61,7 +61,7 @@ static int (*spmv[])(BLASLONG, FLOAT, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT #endif }; -#ifdef SMP +#ifdef SMPTEST static int (*spmv_thread[])(BLASLONG, FLOAT *, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xspmv_thread_U, xspmv_thread_L, @@ -88,7 +88,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint info; int uplo; FLOAT *buffer; -#ifdef SMP +#ifdef SMPTEST int nthreads; #endif @@ -127,7 +127,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, buffer = (FLOAT *)blas_memory_alloc(1); -#ifdef SMP +#ifdef SMPTEST nthreads = num_cpu_avail(2); if (nthreads == 1) { @@ -135,7 +135,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, (spmv[uplo])(n, alpha_r, alpha_i, a, b, incx, c, incy, buffer); -#ifdef SMP +#ifdef SMPTEST } else { diff --git a/kernel/arm/KERNEL.ARMV5 b/kernel/arm/KERNEL.ARMV5 new file mode 100644 index 000000000..ecf278cf9 --- /dev/null +++ b/kernel/arm/KERNEL.ARMV5 @@ -0,0 +1,134 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 2810c6500..6edcf1c48 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,11 +1,20 @@ SGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVNKERNEL = ../arm/gemv_n.c +DGEMVTKERNEL = ../arm/gemv_t.c + CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +#ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +#ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +#ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +#ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c + + #STRMMKERNEL = ../generic/trmmkernel_2x2.c #SGEMMKERNEL = ../generic/gemmkernel_2x2.c #SGEMMONCOPY = ../generic/gemm_ncopy_2.c @@ -86,18 +95,18 @@ CSWAPKERNEL = swap_vfp.S ZSWAPKERNEL = swap_vfp.S # BAD SGEMVNKERNEL = gemv_n_vfp.S -DGEMVNKERNEL = gemv_n_vfp.S +# BAD DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = cgemv_n_vfp.S ZGEMVNKERNEL = zgemv_n_vfp.S # BAD SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S +# BAD DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -# CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +#CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 1a1c7a2e0..878e3cdd8 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -7,15 +7,19 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm_kernel_2x8_nehalem.S -DGEMMINCOPY = dgemm_ncopy_2.S -DGEMMITCOPY = dgemm_tcopy_2.S -DGEMMONCOPY = ../generic/gemm_ncopy_8.c -DGEMMOTCOPY = dgemm_tcopy_8.S -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) + + +DGEMMKERNEL = gemm_kernel_4x4_core2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + + CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S CGEMMINCOPY = zgemm_ncopy_2.S CGEMMITCOPY = zgemm_tcopy_2.S @@ -40,10 +44,11 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S -DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S -DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S -DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S -DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S + CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index c321be752..4d095d21f 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,34 +1,35 @@ -SGEMMKERNEL = sgemm_kernel_8x8_sandy.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMKERNEL = gemm_kernel_4x8_nehalem.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + + DGEMMKERNEL = dgemm_kernel_4x8_sandy.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c -#DGEMMONCOPY = gemm_ncopy_4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -#DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S -CGEMMKERNEL = cgemm_kernel_4x8_sandy.S -CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c -CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c + +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S + + ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S ZGEMMINCOPY = ZGEMMITCOPY = @@ -58,6 +59,7 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) #ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S #ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S #ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S + STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/lapack-devel.log b/lapack-devel.log new file mode 100644 index 000000000..8243bb890 --- /dev/null +++ b/lapack-devel.log @@ -0,0 +1,19 @@ +======================================================================================== +2014/05/07 Saar + +Platform: BULLDOZER single thread + + + --> LAPACK TESTING SUMMARY <-- + Processing LAPACK Testing output found in the TESTING direcory +SUMMARY nb test run numerical error other error +================ =========== ================= ================ +REAL 1079349 0 (0.000%) 0 (0.000%) +DOUBLE PRECISION 1080161 0 (0.000%) 0 (0.000%) +COMPLEX 556022 0 (0.000%) 0 (0.000%) +COMPLEX16 556834 0 (0.000%) 0 (0.000%) + +--> ALL PRECISIONS 3272366 0 (0.000%) 0 (0.000%) + +======================================================================================== + diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 85154280f..d8cef80ba 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -54,9 +54,9 @@ include ../make.inc # ####################################################################### -ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla_array.o iparmq.o \ +ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o \ ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \ - ../INSTALL/ilaver.o + ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o SCLAUX = \ sbdsdc.o \ @@ -92,7 +92,7 @@ DZLAUX = \ dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ dsteqr.o dsterf.o dlaisnan.o disnan.o \ dlartgp.o dlartgs.o \ - ../INSTALL/dsecnd_$(TIMER).o + ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o SLASRC = \ sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ @@ -101,7 +101,7 @@ SLASRC = \ sgegs.o sgegv.o sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ sgels.o sgelsd.o sgelss.o sgelsx.o sgelsy.o sgeql2.o sgeqlf.o \ sgeqp3.o sgeqpf.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ - sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvx.o \ + sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesvd.o sgesvx.o \ sgetc2.o sgetri.o \ sggbak.o sggbal.o sgges.o sggesx.o sggev.o sggevx.o \ sggglm.o sgghrd.o sgglse.o sggqrf.o \ @@ -120,7 +120,7 @@ SLASRC = \ slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slasy2.o slasyf.o slasyf_rook.o \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ - sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ + slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ @@ -147,7 +147,7 @@ SLASRC = \ stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ stptrs.o \ strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ - strtrs.o stzrqf.o stzrzf.o sstemr.o \ + strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ @@ -157,7 +157,7 @@ SLASRC = \ sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ stpqrt.o stpqrt2.o stpmqrt.o stprfb.o -DSLASRC = spotrs.o +DSLASRC = spotrs.o ifdef USEXBLAS SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ @@ -176,7 +176,7 @@ CLASRC = \ cgegs.o cgegv.o cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ cgels.o cgelsd.o cgelss.o cgelsx.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ cgeqpf.o cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ - cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o \ + cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesvd.o \ cgesvx.o cgetc2.o cgetri.o \ cggbak.o cggbal.o cgges.o cggesx.o cggev.o cggevx.o cggglm.o \ cgghrd.o cgglse.o cggqrf.o cggrqf.o \ @@ -208,7 +208,7 @@ CLASRC = \ clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ clasyf.o clasyf_rook.o clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ - clatzm.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ + clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ cposv.o cposvx.o cpotri.o cpstrf.o cpstf2.o \ cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ @@ -225,7 +225,7 @@ CLASRC = \ ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ ctprfs.o ctptri.o \ ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ - ctrsyl.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ + ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ @@ -252,7 +252,7 @@ CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ cla_lin_berr.o clarscl2.o clascl2.o cla_wwaddw.o endif -ZCLASRC = cpotrs.o +ZCLASRC = cpotrs.o DLASRC = \ dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ @@ -261,7 +261,7 @@ DLASRC = \ dgegs.o dgegv.o dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ dgels.o dgelsd.o dgelss.o dgelsx.o dgelsy.o dgeql2.o dgeqlf.o \ dgeqp3.o dgeqpf.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ - dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvx.o \ + dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesvd.o dgesvx.o \ dgetc2.o dgetri.o \ dggbak.o dggbal.o dgges.o dggesx.o dggev.o dggevx.o \ dggglm.o dgghrd.o dgglse.o dggqrf.o \ @@ -279,8 +279,8 @@ DLASRC = \ dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \ dlargv.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlasy2.o dlasyf.o dlasyf_rook.o \ - dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o \ - dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ + dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ + dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ dorgrq.o dorgtr.o dorm2l.o dorm2r.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ @@ -307,7 +307,7 @@ DLASRC = \ dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ dtptrs.o \ dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ - dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ + dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ @@ -335,8 +335,8 @@ ZLASRC = \ zgegs.o zgegv.o zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ zgels.o zgelsd.o zgelss.o zgelsx.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ zgeqpf.o zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ - zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvx.o zgetc2.o \ - zgetri.o \ + zgesc2.o zgesdd.o zgesvd.o zgesvx.o zgetc2.o \ + zgetri.o \ zggbak.o zggbal.o zgges.o zggesx.o zggev.o zggevx.o zggglm.o \ zgghrd.o zgglse.o zggqrf.o zggrqf.o \ zggsvd.o zggsvp.o \ @@ -370,7 +370,7 @@ ZLASRC = \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ zlassq.o zlasyf.o zlasyf_rook.o \ zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ - zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ + zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ zposv.o zposvx.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ @@ -387,7 +387,7 @@ ZLASRC = \ ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ ztprfs.o ztptri.o \ ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ - ztrsyl.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ + ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ @@ -417,8 +417,6 @@ endif ALLOBJ = $(SLASRC) $(DLASRC) $(DSLASRC) $(CLASRC) $(ZLASRC) $(ZCLASRC) \ $(SCLAUX) $(DZLAUX) $(ALLAUX) -ALLOBJ_P = $(ALLOBJ:.o=.$(PSUFFIX)) - ifdef USEXBLAS ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) endif @@ -435,6 +433,7 @@ lapacklib: $(ALLOBJ) $(ALLXOBJ) $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ_P) $(RANLIB) $@ + single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX) @@ -483,16 +482,11 @@ clean: %.$(PSUFFIX): %.f $(FORTRAN) $(POPTS) -c $< -o $@ -slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ -dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ -sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ -dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ -cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ -zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ -slaruv.$(PSUFFIX): slaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ -dlaruv.$(PSUFFIX): dlaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ -sla_wwaddw.$(PSUFFIX): sla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ -dla_wwaddw.$(PSUFFIX): dla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ -cla_wwaddw.$(PSUFFIX): cla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ -zla_wwaddw.$(PSUFFIX): zla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ +slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ + diff --git a/lapack-netlib/TESTING/ctest_rfp.in b/lapack-netlib/TESTING/ctest_rfp.in index d6988f2a7..612bd8efb 100644 --- a/lapack-netlib/TESTING/ctest_rfp.in +++ b/lapack-netlib/TESTING/ctest_rfp.in @@ -5,5 +5,5 @@ Data file for testing COMPLEX LAPACK linear equation routines RFP format 1 2 15 Values of NRHS (number of right hand sides) 9 Number of matrix types (list types on next line if 0 < NTYPES < 9) 1 2 3 4 5 6 7 8 9 Matrix Types -30.0 Threshold value of test ratio +50.0 Threshold value of test ratio T Put T to test the error exits diff --git a/lapack-netlib/TESTING/svd.in b/lapack-netlib/TESTING/svd.in index bc0ae2d2e..1de42807d 100644 --- a/lapack-netlib/TESTING/svd.in +++ b/lapack-netlib/TESTING/svd.in @@ -7,7 +7,7 @@ SVD: Data file for testing Singular Value Decomposition routines 2 2 2 2 2 Values of NBMIN (minimum blocksize) 1 0 5 9 1 Values of NX (crossover point) 2 0 2 2 2 Values of NRHS -50.0 Threshold value +54.0 Threshold value T Put T to test the LAPACK routines T Put T to test the driver routines T Put T to test the error exits diff --git a/lapack/Makefile b/lapack/Makefile index 215badb74..f99416fa6 100644 --- a/lapack/Makefile +++ b/lapack/Makefile @@ -1,7 +1,8 @@ TOPDIR = .. include ../Makefile.system -SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs +#SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs +SUBDIRS = getrf getf2 laswp getrs potrf potf2 FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 diff --git a/make.inc b/make.inc index 01b9bde92..d3f91cbaa 100644 --- a/make.inc +++ b/make.inc @@ -5,7 +5,7 @@ LOADER = $(FORTRAN) TIMER = NONE ARCHFLAGS= -ru #RANLIB = ranlib -BLASLIB = +BLASLIB = ../../../libopenblas.a TMGLIB = tmglib.a -EIGSRCLIB = eigsrc.a -LINSRCLIB = linsrc.a +#EIGSRCLIB = eigsrc.a +#LINSRCLIB = linsrc.a diff --git a/param.h b/param.h index aae648f8c..ae40ac1d7 100644 --- a/param.h +++ b/param.h @@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 @@ -1104,10 +1104,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #else -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 @@ -2021,6 +2021,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(ARMV5) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC