diff --git a/Makefile.system b/Makefile.system index b8b9ba837..0fd223d60 100644 --- a/Makefile.system +++ b/Makefile.system @@ -279,7 +279,12 @@ endif BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) +CCOMMON_OPT += -march=mips64 +FCOMMON_OPT += -march=mips64 +endif + +ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif @@ -534,8 +539,10 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) +ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif +endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -600,9 +607,11 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) +ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif +endif ifdef NO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY diff --git a/common_linux.h b/common_linux.h index 8b3d44bfa..8d9019a0d 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { +#if defined (LOONGSON3B) +#if defined (__64BIT__) + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#else + return 0; //NULL Implementation on Loongson 3B 32bit. 
+#endif +#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); +#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_mips64.h b/common_mips64.h index 35d8265bc..85348377e 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) - unsigned long long tmp; - __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); - ret=tmp; +#if defined(LOONGSON3A) || defined(LOONGSON3B) + // unsigned long long tmp; + //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); + //ret=tmp; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $2\n" + ".set pop": "=r"(ret):: "memory"); + #else __asm__ __volatile__(".set push \n" ".set mips32r2\n" @@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){ return ret; } +#if defined(LOONGSON3A) || defined(LOONGSON3B) +#ifndef NO_AFFINITY +#define WHEREAMI +static inline int WhereAmI(void){ + int ret=0; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $0\n" + ".set pop": "=r"(ret):: "memory"); + return ret; + +} +#endif +#endif + static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } @@ -234,6 +254,11 @@ REALNAME: ;\ #define FIXED_PAGESIZE (16UL << 10) #endif +#if defined(LOONGSON3B) +#define PAGESIZE (32UL << 10) +#define FIXED_PAGESIZE (32UL << 10) +#endif + #ifndef PAGESIZE #define PAGESIZE (64UL << 10) #endif @@ -245,7 +270,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) || defined(LOONGSON3B) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips.c b/cpuid_mips.c index f50a4ec3e..217492dd7 100644 --- 
a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_UNKNOWN 0 #define CPU_SICORTEX 1 #define CPU_LOONGSON3A 2 +#define CPU_LOONGSON3B 3 static char *cpuname[] = { "UNKOWN", "SICORTEX", - "LOONGSON3A" + "LOONGSON3A", + "LOONGSON3B" }; int detect(void){ @@ -101,6 +103,8 @@ int detect(void){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ @@ -130,6 +134,8 @@ void get_architecture(void){ void get_subarchitecture(void){ if(detect()==CPU_LOONGSON3A) { printf("LOONGSON3A"); + }else if(detect()==CPU_LOONGSON3B){ + printf("LOONGSON3B"); }else{ printf("SICORTEX"); } @@ -149,6 +155,15 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_LOONGSON3B){ + printf("#define LOONGSON3B\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define SICORTEX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -164,6 +179,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_LOONGSON3A) { printf("loongson3a\n"); + }else if(detect()==CPU_LOONGSON3B) { + printf("loongson3b\n"); }else{ #ifdef __mips64 printf("mips64\n"); diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 321e88f0c..b81c6fa40 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( range_M[0] = 0; i = arg -> m; } else { - 
range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 9d83e950a..9ffe17040 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -55,8 +55,8 @@ int CNAME(int mode, range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index ac9c87850..feb45eb58 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -389,12 +389,13 @@ static void *alloc_mmap(void *address){ if (map_address != (void *)-1) { #ifdef OS_LINUX -#ifdef DEBUG - int ret; +#if 1 + //#ifdef DEBUG + int ret=0; ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; - perror("alloc_mmap:"); + perror("OpenBLAS alloc_mmap:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index fc7f0447e..21f56e889 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -696,5 +696,20 @@ void blas_set_parameter(void){ } #endif #endif + +#if defined(LOONGSON3B) +#ifdef SMP + if(blas_num_threads == 1 || blas_num_threads == 2){ +#endif + //non-SMP build, or SMP with 1 or 2 threads + dgemm_r = 640; +#ifdef SMP + }else{ + //SMP with any other thread count + dgemm_r = 160; + } +#endif +#endif + } #endif diff --git a/getarch.c b/getarch.c index df052df8a..5b614472a 100644 --- a/getarch.c +++ b/getarch.c @@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ +/* #define FORCE_LOONGSON3B */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_GENERIC */ /* #define FORCE_SPARC */ @@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_LOONGSON3B +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "LOONGSON3B" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DLOONGSON3B " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "loongson3b" +#define CORENAME "LOONGSON3B" +#else +#endif + #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4e331a445..4f419dc80 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ +ifeq ($(TARGET), LOONGSON3B) +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + 
+$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) 
$(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) 
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c new file mode 100644 index 000000000..3645ef154 --- /dev/null +++ b/kernel/generic/gemmkernel_2x2.c @@ -0,0 +1,157 @@ +#include "common.h" +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; + for (j=0; j