diff --git a/Changelog.txt b/Changelog.txt index ae2a77e5a..e122300ec 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,22 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.1.0 +23-Mar-2012 +common: + * Set soname of shared library on Linux. + * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use + this flag to control the library name, e.g. libopenblas.a, + libopenblas_ifort.a or libopenblas_omp.a. + * Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule. + The library uses a single thread in GEMM functions for small matrices. +x86/x86_64: + * Used GEMV SSE/SSE2 kernels on x86 32-bit. + * Exported CBLAS functions in Windows DLL. +MIPS64: + * Completed Level-3 BLAS optimization on Loongson 3A CPU. + * Improved GEMV performance on Loongson 3A CPU. + * Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL) + ==================================================================== Version 0.1 alpha2.5 19-Feb-2012 diff --git a/Makefile.rule b/Makefile.rule index f7d60b052..650478a07 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.1alpha2.5 +VERSION = 0.1.0 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index b8b9ba837..0fd223d60 100644 --- a/Makefile.system +++ b/Makefile.system @@ -279,7 +279,12 @@ endif BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) +CCOMMON_OPT += -march=mips64 +FCOMMON_OPT += -march=mips64 +endif + +ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif @@ -534,8 +539,10 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) +ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif +endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -600,9 +607,11 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) +ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif +endif ifdef NO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY diff --git a/README b/README index 79ab48d8b..c8c2c2c55 100644 --- a/README +++ b/README @@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve 9.Known Issues * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. +* On Loongson 3A, make test fails because of a pthread_create error. The error code is EAGAIN. However, the same test case runs correctly when launched directly from the shell, so we do not believe this is a bug in OpenBLAS. 10. Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). 
diff --git a/common_linux.h b/common_linux.h index 8b3d44bfa..b0381d991 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { +#if defined (LOONGSON3B) +#if defined (__64BIT__) + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#else + return 0; //NULL Implementation on Loongson 3B 32bit. +#endif +#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 - unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); +// unsigned long null_nodemask=0; + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_macro.h b/common_macro.h index bcaa9f38b..0c34ecb01 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2127,7 +2127,9 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) +extern BLASLONG gemm_offset_a; +extern BLASLONG gemm_offset_b; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_mips64.h b/common_mips64.h index acea79011..85348377e 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) - unsigned long long tmp; - __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); - ret=tmp; +#if defined(LOONGSON3A) || defined(LOONGSON3B) + // unsigned long long tmp; + //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); + //ret=tmp; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + 
"rdhwr %0, $2\n" + ".set pop": "=r"(ret):: "memory"); + #else __asm__ __volatile__(".set push \n" ".set mips32r2\n" @@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){ return ret; } +#if defined(LOONGSON3A) || defined(LOONGSON3B) +#ifndef NO_AFFINITY +#define WHEREAMI +static inline int WhereAmI(void){ + int ret=0; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $0\n" + ".set pop": "=r"(ret):: "memory"); + return ret; + +} +#endif +#endif + static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } @@ -152,6 +172,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.d #define CMPLE c.le.d #define CMPLT c.lt.d +#define NEG neg.d #else #define LD lwc1 #define ST swc1 @@ -170,6 +191,14 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.s #define CMPLE c.le.s #define CMPLT c.lt.s +#define PLU plu.ps +#define PLL pll.ps +#define PUU puu.ps +#define PUL pul.ps +#define MADPS madd.ps +#define CVTU cvt.s.pu +#define CVTL cvt.s.pl +#define NEG neg.s #endif #if defined(__64BIT__) && defined(USE64BITINT) @@ -218,13 +247,18 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 8 << 20) +#define BUFFER_SIZE ( 32 << 20) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif +#if defined(LOONGSON3B) +#define PAGESIZE (32UL << 10) +#define FIXED_PAGESIZE (32UL << 10) +#endif + #ifndef PAGESIZE #define PAGESIZE (64UL << 10) #endif @@ -236,7 +270,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) || defined(LOONGSON3B) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips.c b/cpuid_mips.c index f50a4ec3e..217492dd7 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_UNKNOWN 0 #define CPU_SICORTEX 1 #define CPU_LOONGSON3A 2 +#define CPU_LOONGSON3B 3 static char *cpuname[] = { "UNKOWN", "SICORTEX", - "LOONGSON3A" + "LOONGSON3A", + "LOONGSON3B" }; int detect(void){ @@ -101,6 +103,8 @@ int detect(void){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ @@ -130,6 +134,8 @@ void get_architecture(void){ void get_subarchitecture(void){ if(detect()==CPU_LOONGSON3A) { printf("LOONGSON3A"); + }else if(detect()==CPU_LOONGSON3B){ + printf("LOONGSON3B"); }else{ printf("SICORTEX"); } @@ -149,6 +155,15 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_LOONGSON3B){ + printf("#define LOONGSON3B\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define SICORTEX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -164,6 +179,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_LOONGSON3A) { printf("loongson3a\n"); + }else if(detect()==CPU_LOONGSON3B) { + printf("loongson3b\n"); }else{ #ifdef __mips64 printf("mips64\n"); diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 321e88f0c..b81c6fa40 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; 
diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index ba54612eb..f9007f831 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; - queue[num_cpu].sa = NULL; +#if defined(LOONGSON3A) + queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; + queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; +#else + queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; +#endif queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { +#if defined(LOONGSON3A) queue[0].sa = sa; - queue[0].sb = sb; - + queue[0].sb = sa + GEMM_OFFSET_A1 * 5; +#else + queue[0].sa = sa; + queue[0].sb = sb; +#endif queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 9d83e950a..9ffe17040 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -55,8 +55,8 @@ int CNAME(int mode, range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index c0f77c4c9..66067a05c 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -500,6 +500,7 @@ static int blas_monitor(void *arg){ /* Initializing routine */ int blas_thread_init(void){ BLASLONG i; + int ret; #ifdef NEED_STACKATTR pthread_attr_t attr; #endif @@ -545,12 +546,16 @@ int blas_thread_init(void){ pthread_cond_init (&thread_status[i].wakeup, NULL); #ifdef NEED_STACKATTR - pthread_create(&blas_threads[i], &attr, + ret=pthread_create(&blas_threads[i], &attr, (void *)&blas_thread_server, (void *)i); #else - pthread_create(&blas_threads[i], NULL, + 
ret=pthread_create(&blas_threads[i], NULL, (void *)&blas_thread_server, (void *)i); #endif + if(ret!=0){ + fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret); + exit(1); + } } #ifdef MONITOR @@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; +#if defined(ARCH_MIPS64) + //set parameters for different number of threads. + blas_set_parameter(); +#endif + } void openblas_set_num_threads(int num_threads) { diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4fd4cd440..c45856fd9 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); +#if defined(ARCH_MIPS64) + //set parameters for different number of threads. + blas_set_parameter(); +#endif + } void openblas_set_num_threads(int num_threads) { diff --git a/driver/others/memory.c b/driver/others/memory.c index dd8334477..3f1a5f60a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -390,11 +390,11 @@ static void *alloc_mmap(void *address){ #ifdef OS_LINUX #ifdef DEBUG - int ret; + int ret=0; ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; - perror("alloc_mmap:"); + perror("OpenBLAS alloc_mmap:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); } @@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){ if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) #ifndef DYNAMIC_ARCH blas_set_parameter(); #endif diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 9e72fd24f..21f56e889 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -45,8 +45,22 @@ int get_L2_size(void); 
#define DEFAULT_GEMM_P 128 #define DEFAULT_GEMM_Q 128 #define DEFAULT_GEMM_R 128 +#define DEFAULT_GEMM_OFFSET_A 0 +#define DEFAULT_GEMM_OFFSET_B 0 /* Global Parameter */ +#if GEMM_OFFSET_A == gemm_offset_a +BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; +#else +BLASLONG gemm_offset_a = GEMM_OFFSET_A; +#endif + +#if GEMM_OFFSET_B == gemm_offset_b +BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; +#else +BLASLONG gemm_offset_b = GEMM_OFFSET_B; +#endif + #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; #else @@ -666,3 +680,36 @@ void blas_set_parameter(void){ #endif #endif + +#if defined(ARCH_MIPS64) +void blas_set_parameter(void){ +#if defined(LOONGSON3A) +#ifdef SMP + if(blas_num_threads == 1){ +#endif + //single thread + dgemm_r = 1024; +#ifdef SMP + }else{ + //multi thread + dgemm_r = 200; + } +#endif +#endif + +#if defined(LOONGSON3B) +#ifdef SMP + if(blas_num_threads == 1 || blas_num_threads == 2){ +#endif + //single thread + dgemm_r = 640; +#ifdef SMP + }else{ + //multi thread + dgemm_r = 160; + } +#endif +#endif + +} +#endif diff --git a/getarch.c b/getarch.c index df052df8a..5b614472a 100644 --- a/getarch.c +++ b/getarch.c @@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ +/* #define FORCE_LOONGSON3B */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_GENERIC */ /* #define FORCE_SPARC */ @@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_LOONGSON3B +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "LOONGSON3B" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DLOONGSON3B " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "loongson3b" +#define CORENAME "LOONGSON3B" +#else +#endif + #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" diff --git a/interface/symm.c b/interface/symm.c index a0d52c49d..b447f13e8 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) @@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4e331a445..4f419dc80 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD 
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ +ifeq ($(TARGET), LOONGSON3B) +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX 
-ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA 
-DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c new file mode 100644 index 000000000..3645ef154 --- /dev/null +++ b/kernel/generic/gemmkernel_2x2.c @@ -0,0 +1,157 @@ +#include "common.h" +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; + for (j=0; j