diff --git a/Changelog.txt b/Changelog.txt index ae2a77e5a..e122300ec 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,22 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.1.0 +23-Mar-2012 +common: + * Set soname of shared library on Linux. + * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use + this flag to control the library name, e.g. libopenblas.a, + libopenblas_ifort.a or libopenblas_omp.a. + * Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule. + The library uses a single thread in GEMM functions for small matrices. +x86/x86_64: + * Used GEMV SSE/SSE2 kernels on x86 32-bit. + * Exported CBLAS functions in Windows DLL. +MIPS64: + * Completed Level-3 BLAS optimization on Loongson 3A CPU. + * Improved GEMV performance on Loongson 3A CPU. + * Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL) + ==================================================================== Version 0.1 alpha2.5 19-Feb-2012 diff --git a/Makefile b/Makefile index 2f56480f9..ba04aa989 100644 --- a/Makefile +++ b/Makefile @@ -82,27 +82,28 @@ endif shared : ifeq ($(OSNAME), Linux) $(MAKE) -C exports so - -ln -fs $(LIBSONAME) libopenblas.so + -ln -fs $(LIBSONAME) $(LIBPREFIX).so + -ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) $(MAKE) -C exports so - -ln -fs $(LIBSONAME) libopenblas.so + -ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) $(MAKE) -C exports so - -ln -fs $(LIBSONAME) libopenblas.so + -ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) $(MAKE) -C exports dyn - -ln -fs $(LIBDYNNAME) libopenblas.dylib + -ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) libopenblas.dll + -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) libopenblas.dll + -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif tests : @@ 
-130,7 +131,7 @@ endif ifeq ($(NOFORTRAN), 1) $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) endif - -ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) + -ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -158,7 +159,7 @@ endif prof : prof_blas prof_lapack prof_blas : - ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX) + ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d prof || exit 1 ; \ @@ -169,7 +170,7 @@ ifdef DYNAMIC_ARCH endif blas : - ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d libs || exit 1 ; \ @@ -177,7 +178,7 @@ blas : done hpl : - ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -191,7 +192,7 @@ ifdef DYNAMIC_ARCH endif hpl_p : - ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX) + ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -285,7 +286,8 @@ clean :: #ifdef DYNAMIC_ARCH @$(MAKE) -C kernel clean #endif - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h + @$(MAKE) -C reference clean + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @if test -d lapack-3.4.0; then \ echo deleting lapack-3.4.0; \ diff --git a/Makefile.install b/Makefile.install index 2778a491f..46105fc39 100644 
--- a/Makefile.install +++ b/Makefile.install @@ -38,33 +38,34 @@ install : lib.grd #for install static library @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) @cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR) - @-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.$(LIBSUFFIX) + @-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX) #for install shared library @echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), Linux) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) -cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR) -install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dylib + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) - 
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll endif @echo Install OK! diff --git a/Makefile.rule b/Makefile.rule index db1a48d9f..650478a07 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,12 @@ # # This library's version -VERSION = 0.1alpha2.5 +VERSION = 0.1.0 + +# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a +# and libopenblas_$(LIBNAMESUFFIX).so. The soname of the shared library +# is then libopenblas_$(LIBNAMESUFFIX).so.0. +# LIBNAMESUFFIX = omp # You can specify the target architecture, otherwise it's # automatically detected. @@ -83,6 +88,11 @@ VERSION = 0.1alpha2.5 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 +# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed +# with a single thread. You can use this flag to avoid the overhead of multi-threading +# for small matrix sizes. The default value is 4. +# GEMM_MULTITHREAD_THRESHOLD = 4 + # If you need santy check by comparing reference BLAS. It'll be very # slow (Not implemented yet). # SANITY_CHECK = 1 diff --git a/Makefile.system b/Makefile.system index e8ba3694e..0fd223d60 100644 --- a/Makefile.system +++ b/Makefile.system @@ -40,6 +40,11 @@ ifdef INTERFACE64 GETARCH_FLAGS += -DUSE64BITINT endif +ifndef GEMM_MULTITHREAD_THRESHOLD +GEMM_MULTITHREAD_THRESHOLD=4 +endif +GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) + # This operation is expensive, so execution should be once. 
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 @@ -274,7 +279,12 @@ endif BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) +CCOMMON_OPT += -march=mips64 +FCOMMON_OPT += -march=mips64 +endif + +ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif @@ -341,7 +351,8 @@ endif ifeq ($(F_COMPILER), GFORTRAN) CCOMMON_OPT += -DF_INTERFACE_GFORT -FCOMMON_OPT += -Wall +FCOMMON_OPT += -Wall +EXTRALIB += -lgfortran ifdef NO_BINARY_MODE ifeq ($(ARCH), mips64) ifdef BINARY64 @@ -528,8 +539,10 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) +ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif +endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -568,7 +581,11 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif +ifndef LIBNAMESUFFIX LIBPREFIX = libopenblas +else +LIBPREFIX = libopenblas_$(LIBNAMESUFFIX) +endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) @@ -590,9 +607,11 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) +ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif +endif ifdef NO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY @@ -636,6 +655,7 @@ MD5SUM = md5sum AWK = awk REVISION = -r$(VERSION) +MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) diff --git a/README b/README index 79ab48d8b..c8c2c2c55 100644 --- a/README +++ b/README @@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve 9.Known Issues * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. +* On Loongson 3A, "make test" may fail with a pthread_create error (error code EAGAIN). However, the same test cases pass when run directly from the shell, so this does not appear to be a bug in OpenBLAS. 10. 
Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). diff --git a/common_linux.h b/common_linux.h index 8b3d44bfa..b0381d991 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { +#if defined (LOONGSON3B) +#if defined (__64BIT__) + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#else + return 0; //NULL Implementation on Loongson 3B 32bit. +#endif +#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 - unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); +// unsigned long null_nodemask=0; + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_macro.h b/common_macro.h index bcaa9f38b..0c34ecb01 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2127,7 +2127,9 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) +extern BLASLONG gemm_offset_a; +extern BLASLONG gemm_offset_b; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_mips64.h b/common_mips64.h index acea79011..85348377e 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) - unsigned long long tmp; - __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); - ret=tmp; +#if defined(LOONGSON3A) || defined(LOONGSON3B) + // unsigned long long tmp; + 
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); + //ret=tmp; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $2\n" + ".set pop": "=r"(ret):: "memory"); + #else __asm__ __volatile__(".set push \n" ".set mips32r2\n" @@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){ return ret; } +#if defined(LOONGSON3A) || defined(LOONGSON3B) +#ifndef NO_AFFINITY +#define WHEREAMI +static inline int WhereAmI(void){ + int ret=0; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $0\n" + ".set pop": "=r"(ret):: "memory"); + return ret; + +} +#endif +#endif + static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } @@ -152,6 +172,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.d #define CMPLE c.le.d #define CMPLT c.lt.d +#define NEG neg.d #else #define LD lwc1 #define ST swc1 @@ -170,6 +191,14 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.s #define CMPLE c.le.s #define CMPLT c.lt.s +#define PLU plu.ps +#define PLL pll.ps +#define PUU puu.ps +#define PUL pul.ps +#define MADPS madd.ps +#define CVTU cvt.s.pu +#define CVTL cvt.s.pl +#define NEG neg.s #endif #if defined(__64BIT__) && defined(USE64BITINT) @@ -218,13 +247,18 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 8 << 20) +#define BUFFER_SIZE ( 32 << 20) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif +#if defined(LOONGSON3B) +#define PAGESIZE (32UL << 10) +#define FIXED_PAGESIZE (32UL << 10) +#endif + #ifndef PAGESIZE #define PAGESIZE (64UL << 10) #endif @@ -236,7 +270,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) || defined(LOONGSON3B) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips.c b/cpuid_mips.c index f50a4ec3e..217492dd7 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, 
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_UNKNOWN 0 #define CPU_SICORTEX 1 #define CPU_LOONGSON3A 2 +#define CPU_LOONGSON3B 3 static char *cpuname[] = { "UNKOWN", "SICORTEX", - "LOONGSON3A" + "LOONGSON3A", + "LOONGSON3B" }; int detect(void){ @@ -101,6 +103,8 @@ int detect(void){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ @@ -130,6 +134,8 @@ void get_architecture(void){ void get_subarchitecture(void){ if(detect()==CPU_LOONGSON3A) { printf("LOONGSON3A"); + }else if(detect()==CPU_LOONGSON3B){ + printf("LOONGSON3B"); }else{ printf("SICORTEX"); } @@ -149,6 +155,15 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_LOONGSON3B){ + printf("#define LOONGSON3B\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define SICORTEX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -164,6 +179,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_LOONGSON3A) { printf("loongson3a\n"); + }else if(detect()==CPU_LOONGSON3B) { + printf("loongson3b\n"); }else{ #ifdef __mips64 printf("mips64\n"); diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 321e88f0c..b81c6fa40 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = 
range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index ba54612eb..f9007f831 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; - queue[num_cpu].sa = NULL; +#if defined(LOONGSON3A) + queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; + queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; +#else + queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; +#endif queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { +#if defined(LOONGSON3A) queue[0].sa = sa; - queue[0].sb = sb; - + queue[0].sb = sa + GEMM_OFFSET_A1 * 5; +#else + queue[0].sa = sa; + queue[0].sb = sb; +#endif queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 9d83e950a..9ffe17040 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -55,8 +55,8 @@ int CNAME(int mode, range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index c0f77c4c9..66067a05c 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -500,6 +500,7 @@ static int blas_monitor(void *arg){ /* Initializing routine */ int blas_thread_init(void){ BLASLONG i; + int ret; #ifdef NEED_STACKATTR pthread_attr_t attr; #endif @@ -545,12 +546,16 @@ int blas_thread_init(void){ pthread_cond_init (&thread_status[i].wakeup, NULL); #ifdef NEED_STACKATTR - pthread_create(&blas_threads[i], &attr, + ret=pthread_create(&blas_threads[i], &attr, (void *)&blas_thread_server, (void *)i); 
#else - pthread_create(&blas_threads[i], NULL, + ret=pthread_create(&blas_threads[i], NULL, (void *)&blas_thread_server, (void *)i); #endif + if(ret!=0){ + fprintf(STDERR,"OpenBLAS: pthread_create error in blas_thread_init function. Error code:%d\n",ret); + exit(1); + } } #ifdef MONITOR @@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; +#if defined(ARCH_MIPS64) + //set parameters for different number of threads. + blas_set_parameter(); +#endif + } void openblas_set_num_threads(int num_threads) { diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4fd4cd440..c45856fd9 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); +#if defined(ARCH_MIPS64) + //set parameters for different number of threads. + blas_set_parameter(); +#endif + } void openblas_set_num_threads(int num_threads) { diff --git a/driver/others/memory.c b/driver/others/memory.c index dd8334477..3f1a5f60a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -390,11 +390,11 @@ static void *alloc_mmap(void *address){ #ifdef OS_LINUX #ifdef DEBUG - int ret; + int ret=0; ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; - perror("alloc_mmap:"); + perror("OpenBLAS alloc_mmap:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); } @@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){ if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) #ifndef DYNAMIC_ARCH blas_set_parameter(); #endif diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 9e72fd24f..21f56e889 100644 --- a/driver/others/parameter.c +++ 
b/driver/others/parameter.c @@ -45,8 +45,22 @@ int get_L2_size(void); #define DEFAULT_GEMM_P 128 #define DEFAULT_GEMM_Q 128 #define DEFAULT_GEMM_R 128 +#define DEFAULT_GEMM_OFFSET_A 0 +#define DEFAULT_GEMM_OFFSET_B 0 /* Global Parameter */ +#if GEMM_OFFSET_A == gemm_offset_a +BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; +#else +BLASLONG gemm_offset_a = GEMM_OFFSET_A; +#endif + +#if GEMM_OFFSET_B == gemm_offset_b +BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; +#else +BLASLONG gemm_offset_b = GEMM_OFFSET_B; +#endif + #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; #else @@ -666,3 +680,36 @@ void blas_set_parameter(void){ #endif #endif + +#if defined(ARCH_MIPS64) +void blas_set_parameter(void){ +#if defined(LOONGSON3A) +#ifdef SMP + if(blas_num_threads == 1){ +#endif + //single thread + dgemm_r = 1024; +#ifdef SMP + }else{ + //multi thread + dgemm_r = 200; + } +#endif +#endif + +#if defined(LOONGSON3B) +#ifdef SMP + if(blas_num_threads == 1 || blas_num_threads == 2){ +#endif + //single thread + dgemm_r = 640; +#ifdef SMP + }else{ + //multi thread + dgemm_r = 160; + } +#endif +#endif + +} +#endif diff --git a/exports/Makefile b/exports/Makefile index 69050989c..873e8b270 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -58,16 +58,16 @@ dll : ../$(LIBDLLNAME) dll2 : libgoto2_shared.dll -../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) +../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) - $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ + $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) - -lib /machine:i386 /def:libgoto2.def + -lib /machine:i386 /def:libopenblas.def else - $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ + $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) - -lib /machine:X64 
/def:libgoto2.def + -lib /machine:X64 /def:libopenblas.def endif libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def @@ -75,7 +75,7 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) -libgoto2.def : gensymbol +libopenblas.def : gensymbol perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) libgoto2_shared.def : gensymbol @@ -100,7 +100,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest diff --git a/exports/gensymbol b/exports/gensymbol index 3d8d74dde..6b2a00672 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -301,7 +301,7 @@ if ($ARGV[5] == 1) { #NO_LAPACK=1 @objs = (@blasobjs); -} elsif (-d "../lapack-3.1.1") { +} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") { @objs = (@blasobjs, @lapackobjs, @lapackobjs2); } else { @objs = (@blasobjs, @lapackobjs); @@ -389,6 +389,13 @@ if ($ARGV[0] eq "win2k"){ $count ++; } + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print "\t",$objs,"=$objs"," \@", $count, "\n"; + $count ++; + } + } + exit(0); } diff --git a/f_check b/f_check index 45a946eb6..f5bb5a7f6 100644 --- a/f_check +++ b/f_check @@ -284,6 +284,10 @@ if ($link ne "") { } +if ($vendor eq "INTEL"){ + $linker_a .= "-lgfortran" +} + open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; open(CONFFILE, ">> $config" ) || die "Can't append $config"; diff --git a/getarch.c b/getarch.c index df052df8a..5b614472a 100644 --- a/getarch.c +++ b/getarch.c @@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ +/* #define FORCE_LOONGSON3B */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_GENERIC */ /* #define FORCE_SPARC */ @@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_LOONGSON3B +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "LOONGSON3B" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DLOONGSON3B " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "loongson3b" +#define CORENAME "LOONGSON3B" +#else +#endif + #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" diff --git a/getarch_2nd.c b/getarch_2nd.c index 018f08d31..5339af442 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -34,6 +34,7 @@ int main(int argc, char **argv) { #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif + printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long)(GEMM_MULTITHREAD_THRESHOLD)); } return 0; diff --git a/interface/Makefile b/interface/Makefile index 6764daa95..5cf11cd9b 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -770,20 +770,36 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) +ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< +else +sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< + +dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< +endif qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< - + +ifndef USE_NETLIB_GEMV cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o 
$(@F) $< +else +cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< + +zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< +endif xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< diff --git a/interface/gemm.c b/interface/gemm.c index 7919f822e..28cf5372d 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -397,8 +397,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); args.common = NULL; - args.nthreads = num_cpu_avail(3); + if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD + || args.k <=GEMM_MULTITHREAD_THRESHOLD){ + args.nthreads = 1; + }else{ + args.nthreads = num_cpu_avail(3); + } if (args.nthreads == 1) { #endif diff --git a/interface/netlib/cgemv.f b/interface/netlib/cgemv.f new file mode 100644 index 000000000..d9e55f9a2 --- /dev/null +++ b/interface/netlib/cgemv.f @@ -0,0 +1,285 @@ + SUBROUTINE CGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + COMPLEX ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),X(*),Y(*) +* .. +* +* Purpose +* ======= +* +* CGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, or +* +* y := alpha*A**H*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**H*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. 
+* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* Further Details +* =============== +* +* Level 2 Blas routine. +* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. 
+ COMPLEX ONE + PARAMETER (ONE= (1.0E+0,0.0E+0)) + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY + LOGICAL NOCONJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* + NOCONJ = LSAME(TRANS,'T') +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. 
+* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = ZERO + IF (NOCONJ) THEN + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + ELSE + DO 100 I = 1,M + TEMP = TEMP + CONJG(A(I,J))*X(I) + 100 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140 J = 1,N + TEMP = ZERO + IX = KX + IF (NOCONJ) THEN + DO 120 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130 I = 1,M + TEMP = TEMP + CONJG(A(I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMV . +* + END diff --git a/interface/netlib/dgemv.f b/interface/netlib/dgemv.f new file mode 100644 index 000000000..a41259412 --- /dev/null +++ b/interface/netlib/dgemv.f @@ -0,0 +1,265 @@ + SUBROUTINE DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*),X(*),Y(*) +* .. +* +* Purpose +* ======= +* +* DGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. 
+* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**T*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* Further Details +* =============== +* +* Level 2 Blas routine. 
+* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE,ZERO + PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) +* .. +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('DGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. 
+* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = ZERO + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120 J = 1,N + TEMP = ZERO + IX = KX + DO 110 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGEMV . +* + END diff --git a/interface/netlib/sgemv.f b/interface/netlib/sgemv.f new file mode 100644 index 000000000..afae26980 --- /dev/null +++ b/interface/netlib/sgemv.f @@ -0,0 +1,265 @@ + SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + REAL ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + REAL A(LDA,*),X(*),Y(*) +* .. +* +* Purpose +* ======= +* +* SGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. 
+* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**T*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. 
+* +* Further Details +* =============== +* +* Level 2 Blas routine. +* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE,ZERO + PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) +* .. +* .. Local Scalars .. + REAL TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('SGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. 
+* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = ZERO + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120 J = 1,N + TEMP = ZERO + IX = KX + DO 110 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGEMV . +* + END diff --git a/interface/netlib/zgemv.f b/interface/netlib/zgemv.f new file mode 100644 index 000000000..bb2ae4fcb --- /dev/null +++ b/interface/netlib/zgemv.f @@ -0,0 +1,285 @@ + SUBROUTINE ZGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),X(*),Y(*) +* .. 
+* +* Purpose +* ======= +* +* ZGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, or +* +* y := alpha*A**H*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**H*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. 
+* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* Further Details +* =============== +* +* Level 2 Blas routine. +* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER (ONE= (1.0D+0,0.0D+0)) + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY + LOGICAL NOCONJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* + NOCONJ = LSAME(TRANS,'T') +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. 
+* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = ZERO + IF (NOCONJ) THEN + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + ELSE + DO 100 I = 1,M + TEMP = TEMP + DCONJG(A(I,J))*X(I) + 100 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140 J = 1,N + TEMP = ZERO + IX = KX + IF (NOCONJ) THEN + DO 120 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130 I = 1,M + TEMP = TEMP + DCONJG(A(I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZGEMV . 
+* + END diff --git a/interface/symm.c b/interface/symm.c index a0d52c49d..b447f13e8 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) @@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4e331a445..4f419dc80 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ +ifeq ($(TARGET), LOONGSON3B) +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + 
+$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE 
-DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE 
-DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c new file mode 100644 index 000000000..3645ef154 --- /dev/null +++ b/kernel/generic/gemmkernel_2x2.c @@ -0,0 +1,157 @@ +#include "common.h" +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; + for (j=0; j