diff --git a/Changelog.txt b/Changelog.txt new file mode 100644 index 000000000..b3c438471 --- /dev/null +++ b/Changelog.txt @@ -0,0 +1,27 @@ +OpenBLAS ChangeLog +==================================================================== +Version 0.1 (in development) +26-Feb-2011 + +common: + * Fixed random SEGFAULT when nodemask==NULL on Linux 2.6.34 and above. + Thanks to Mr. Ei-ji Nakama for providing this patch. (Refs issue #12 on github) + * Added DEBUG=1 rule in Makefile.rule to build debug version. + * Disable compiling quad precision in reference BLAS library(netlib BLAS). + * Added unit testcases in utest/ subdir. Used CUnit framework. + * Supported OPENBLAS_* & GOTO_* environment variables (Please see README) + * Imported GotoBLAS2 1.13 BSD version + +x86/x86_64: + * Modified ?axpy functions to return same netlib BLAS results + when incx==0 or incy==0 (Refs issue #7 on github) + * Modified ?swap functions to return same netlib BLAS results + when incx==0 or incy==0 (Refs issue #6 on github) + * Modified ?rot functions to return same netlib BLAS results + when incx==0 or incy==0 (Refs issue #4 on github) + * Detect Intel Westmere to use Nehalem codes. + * Fixed a typo bug about compiling dynamic ARCH library. +MIPS64: + * Improve daxpy performance on ICT Loongson 3A. + * Supported ICT Loongson 3A CPU (Refs issue #1 on github) +==================================================================== diff --git a/Makefile.rule b/Makefile.rule index ecafe0cdc..d9013dd83 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -70,7 +70,7 @@ VERSION = 0.1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by GOTO_THREAD_TIMEOUT +# system). 
Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 # Using special device driver for mapping physically contigous memory @@ -89,7 +89,13 @@ VERSION = 0.1 # UTEST_CHECK = 1 # Common Optimization Flag; -O2 is enough. +# DEBUG = 1 + +ifeq ($(DEBUG), 1) +COMMON_OPT += -g -DDEBUG +else COMMON_OPT += -O2 +endif # Profiling flags COMMON_PROF = -pg diff --git a/README b/README index 23cbdba72..9b04f6f99 100644 --- a/README +++ b/README @@ -4,6 +4,8 @@ OpenBLAS Readme OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) 2.Intallation +Download from project homepage. http://xianyi.github.com/OpenBLAS/ +Or, check out codes from git://github.com/xianyi/OpenBLAS.git 1)Normal compile Please read GotoBLAS_02QuickInstall.txt or type "make" @@ -15,23 +17,43 @@ examples: On X86 box, compile this library for loongson3a CPU. make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A +3)Debug version +make DEBUG=1 + 3.Support CPU & OS Please read GotoBLAS_01Readme.txt +Additional support CPU: +x86_64: + Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. +MIPS64: + ICT Loongson 3A //The initial version used GotoBLAS2 MIPS64 kernels. Thus, the performance is not good. + 4.Usages Link with libopenblas.a or -lopenblas for shared library. -Set the number of threads. for example, +4.1 Set the number of threads with environment variables. for example, export OPENBLAS_NUM_THREADS=4 + or +export GOTO_NUM_THREADS=4 or export OMP_NUM_THREADS=4 -OPENBLAS_NUM_THREAD is prior to OMP_NUM_THREADS. + +The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. + +4.2 Set the number of threads with calling functions. 
for example, +void goto_set_num_threads(int num_threads); +or +void openblas_set_num_threads(int num_threads); 5.Report Bugs Please add a issue in https://github.com/xianyi/OpenBLAS/issues 6.To-Do List: -Support ICT Loongson 3A CPU +Optimization on ICT Loongson 3A CPU 7.Contact OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas + +8.ChangeLog +Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. \ No newline at end of file diff --git a/common_linux.h b/common_linux.h index d18cd2b72..8b3d44bfa 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,8 +68,9 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { - - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 + unsigned long null_nodemask=0; + return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_reference.h b/common_reference.h index d4dca859e..04b11f80f 100644 --- a/common_reference.h +++ b/common_reference.h @@ -43,4 +43,21 @@ void BLASFUNC_REF(csrot) (blasint *, float *, blasint *, float *, blasint *, void BLASFUNC_REF(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); void BLASFUNC_REF(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); +void BLASFUNC_REF(sswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(dswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC_REF(cswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(zswap) (blasint *, double *, blasint *, double *, blasint 
*); +void BLASFUNC_REF(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC_REF(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); + +float _Complex BLASFUNC_REF(cdotu) (blasint *, float *, blasint *, float *, blasint *); +float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *, blasint *); +double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *); +double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *); + #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 288754497..98f744330 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -972,8 +972,15 @@ int get_cpuname(void){ return CPUTYPE_ATOM; case 13: return CPUTYPE_DUNNINGTON; - break; } + break; + case 2: + switch (model) { + case 12: + //Xeon Processor 5600 (Westmere-EP) + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -1289,8 +1296,16 @@ int get_coretype(void){ return CORE_ATOM; case 13: return CORE_DUNNINGTON; - break; } + break; + case 2: + switch (model) { + case 12: + //Xeon Processor 5600 (Westmere-EP) + return CORE_NEHALEM; + } + break; + } case 15: if (model <= 0x2) return CORE_NORTHWOOD; diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c index 8c5473c03..df4d723ab 100644 --- a/driver/level3/gemm3m_level3.c +++ b/driver/level3/gemm3m_level3.c @@ -297,7 +297,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, printf("GEMM: SA .. %p SB .. 
%p\n", sa, sb); #endif -#ifdef DEBUG +#ifdef TIMING innercost = 0; outercost = 0; kernelcost = 0; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 62b310aba..20e811cd0 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -278,7 +278,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); #endif -#ifdef DEBUG +#ifdef TIMING innercost = 0; outercost = 0; kernelcost = 0; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 11f058e96..c0f77c4c9 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -525,7 +525,16 @@ int blas_thread_init(void){ if (thread_timeout < 4) thread_timeout = 4; if (thread_timeout > 30) thread_timeout = 30; thread_timeout = (1 << thread_timeout); - } + }else{ + p = getenv("GOTO_THREAD_TIMEOUT"); + if (p) { + thread_timeout = atoi(p); + if (thread_timeout < 4) thread_timeout = 4; + if (thread_timeout > 30) thread_timeout = 30; + thread_timeout = (1 << thread_timeout); + } + } + for(i = 0; i < blas_num_threads - 1; i++){ @@ -790,6 +799,11 @@ void goto_set_num_threads(int num_threads) { } +void openblas_set_num_threads(int num_threads) { + goto_set_num_threads(num_threads); + +} + /* Compatible function with pthread_create / join */ int gotoblas_pthread(int numthreads, void *function, void *args, int stride) { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4e27717fc..8288f33aa 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -121,6 +121,11 @@ static gotoblas_t *get_coretype(void){ if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; if (model == 12) return &gotoblas_ATOM; return NULL; + + case 2: + //Intel Xeon Processor 5600 (Westmere-EP) + if (model == 12) return &gotoblas_NEHALEM; + return NULL; } case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; diff --git 
a/driver/others/init.c b/driver/others/init.c index 94f883728..7ee7dc45d 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -92,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SHARE_NAME "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map" #define NODE_DIR "/sys/devices/system/node" -#undef DEBUG +//#undef DEBUG /* Private variables */ typedef struct { @@ -581,6 +581,7 @@ void gotoblas_affinity_init(void) { numprocs = 0; #else numprocs = readenv("OPENBLAS_NUM_THREADS"); + if (numprocs == 0) numprocs = readenv("GOTO_NUM_THREADS"); #endif if (numprocs == 0) numprocs = readenv("OMP_NUM_THREADS"); @@ -666,7 +667,7 @@ void gotoblas_affinity_init(void) { setup_mempolicy(); - if (readenv("OPENBLAS_MAIN_FREE")) { + if (readenv("OPENBLAS_MAIN_FREE") || readenv("GOTOBLAS_MAIN_FREE")) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); } diff --git a/driver/others/memory.c b/driver/others/memory.c index fc5265715..dd8334477 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -68,9 +68,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#undef DEBUG +//#undef DEBUG #include "common.h" +#include #ifdef OS_WINDOWS #define ALLOC_WINDOWS @@ -231,6 +232,13 @@ int blas_get_cpu_number(void){ p = getenv("OPENBLAS_NUM_THREADS"); if (p) blas_goto_num = atoi(p); if (blas_goto_num < 0) blas_goto_num = 0; + + if (blas_goto_num == 0) { + p = getenv("GOTO_NUM_THREADS"); + if (p) blas_goto_num = atoi(p); + if (blas_goto_num < 0) blas_goto_num = 0; + } + #endif blas_omp_num = 0; @@ -379,11 +387,23 @@ static void *alloc_mmap(void *address){ MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { - + #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#ifdef DEBUG + int ret; + ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } + +#else + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#endif #endif + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; @@ -979,7 +999,7 @@ void *blas_memory_alloc(int procpos){ memory[position].addr = map_address; #ifdef DEBUG - printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_area[position], position); + printf(" Mapping Succeeded. 
%p(%d)\n", (void *)memory[position].addr, position); #endif } @@ -1010,7 +1030,7 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf("Mapped : %p %3d\n\n", - (void *)alloc_area[position], position); + (void *)memory[position].addr, position); #endif return (void *)memory[position].addr; @@ -1053,7 +1073,7 @@ void blas_memory_free(void *free_area){ #ifdef DEBUG for (position = 0; position < NUM_BUFFERS; position++) - printf("%4ld %p : %d\n", position, alloc_area[position], alloc_used[position]); + printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif return; diff --git a/interface/axpy.c b/interface/axpy.c index 03b981985..dd75b758c 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -81,6 +81,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/interface/swap.c b/interface/swap.c index 7676246f9..271fa083a 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -78,7 +78,12 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #ifdef SMP nthreads = num_cpu_avail(1); - + + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index d3355ea57..9ed72efb9 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -83,6 +83,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. 
+ if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/interface/zswap.c b/interface/zswap.c index f4a03a550..06a889204 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -80,6 +80,11 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/kernel/x86/axpy_sse.S b/kernel/x86/axpy_sse.S index 291a219ce..e06d90184 100644 --- a/kernel/x86/axpy_sse.S +++ b/kernel/x86/axpy_sse.S @@ -1440,6 +1440,12 @@ .L50: movl M, %eax movl Y, YY +//If incx==0 || incy==0, avoid unloop. + cmpl $0, INCX + je .L56 + cmpl $0, INCY + je .L56 + sarl $3, %eax jle .L55 ALIGN_3 diff --git a/kernel/x86/axpy_sse2.S b/kernel/x86/axpy_sse2.S index 5e31d3dba..9b2d5d808 100644 --- a/kernel/x86/axpy_sse2.S +++ b/kernel/x86/axpy_sse2.S @@ -698,6 +698,12 @@ .L40: movl Y, YY movl M, %eax +//If incx==0 || incy==0, avoid unloop. + cmpl $0, INCX + je .L46 + cmpl $0, INCY + je .L46 + sarl $3, %eax jle .L45 ALIGN_3 diff --git a/kernel/x86/rot_sse2.S b/kernel/x86/rot_sse2.S index 8ec1d44bb..e9c5ba1ef 100644 --- a/kernel/x86/rot_sse2.S +++ b/kernel/x86/rot_sse2.S @@ -859,6 +859,10 @@ .L50: movl N, I + cmpl $0, INCX + je .L56 + cmpl $0, INCY + je .L56 sarl $2, I jle .L55 ALIGN_3 diff --git a/kernel/x86/zaxpy_sse.S b/kernel/x86/zaxpy_sse.S index edd9929cd..9c94cec44 100644 --- a/kernel/x86/zaxpy_sse.S +++ b/kernel/x86/zaxpy_sse.S @@ -2857,6 +2857,11 @@ unpcklps ALPHA_I, ALPHA_R unpcklps %xmm5, ALPHA_I #endif +//If incx==0 || incy==0, avoid unloop and jump to end. 
+ cmpl $0, INCX + je .L200 + cmpl $0, INCY + je .L200 movl Y, YY @@ -3090,8 +3095,41 @@ addps %xmm1, %xmm4 movsd %xmm4, (Y) + jmp .L999 ALIGN_3 +.L200: + movl M, %eax + cmpl $0, %eax + jle .L999 + ALIGN_3 + +.L201: + movsd (X), %xmm0 + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + + movsd (Y), %xmm4 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + + movsd %xmm4, (Y) + + decl %eax + jg .L201 + + ALIGN_3 .L999: popl %ebp popl %ebx diff --git a/kernel/x86/zaxpy_sse2.S b/kernel/x86/zaxpy_sse2.S index 40afdc3fc..9c2caa7e8 100644 --- a/kernel/x86/zaxpy_sse2.S +++ b/kernel/x86/zaxpy_sse2.S @@ -1318,6 +1318,12 @@ movl Y, YY movl M, %eax +//If incx==0 || incy==0, avoid unloop and jump to end. + cmpl $0, INCX + je .L58 + cmpl $0, INCY + je .L58 + sarl $2, %eax jle .L55 @@ -1498,6 +1504,7 @@ andl $1, %eax jle .L999 +.L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) @@ -1510,6 +1517,10 @@ movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) + + + decl %eax + jg .L58 ALIGN_3 .L999: diff --git a/kernel/x86/zrot_sse.S b/kernel/x86/zrot_sse.S index d8d01009e..d10183f73 100644 --- a/kernel/x86/zrot_sse.S +++ b/kernel/x86/zrot_sse.S @@ -1285,6 +1285,12 @@ .L50: movl N, I +//if incx ==0 || incy==0 jump to the tail + cmpl $0, INCX + je .L56 + cmpl $0, INCY + je .L56 + sarl $2, I jle .L55 ALIGN_3 diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S index 23c2ec54e..9a7512575 100644 --- a/kernel/x86_64/axpy_sse.S +++ b/kernel/x86_64/axpy_sse.S @@ -1463,6 +1463,12 @@ .L50: movq M, %rax movq Y, YY +//If incx==0 || incy==0, avoid unloop. 
+ cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 + sarq $3, %rax jle .L55 ALIGN_3 diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S index 554602917..dea8d0382 100644 --- a/kernel/x86_64/axpy_sse2.S +++ b/kernel/x86_64/axpy_sse2.S @@ -805,6 +805,12 @@ .L40: movq Y, YY movq M, %rax +//If incx==0 || incy==0, avoid unloop. + cmpq $0, INCX + je .L46 + cmpq $0, INCY + je .L46 + sarq $3, %rax jle .L45 ALIGN_3 diff --git a/kernel/x86_64/rot_sse2.S b/kernel/x86_64/rot_sse2.S index 505554707..502940324 100644 --- a/kernel/x86_64/rot_sse2.S +++ b/kernel/x86_64/rot_sse2.S @@ -887,6 +887,10 @@ .L50: movq N, %rax + cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 sarq $2, %rax jle .L55 ALIGN_3 diff --git a/kernel/x86_64/zaxpy_sse.S b/kernel/x86_64/zaxpy_sse.S index 69cdedaaa..42b920cfb 100644 --- a/kernel/x86_64/zaxpy_sse.S +++ b/kernel/x86_64/zaxpy_sse.S @@ -2893,6 +2893,12 @@ unpcklps %xmm13, %xmm15 #endif +//If incx==0 || incy==0, avoid unloop and jump to end. + cmpq $0, INCX + je .L200 + cmpq $0, INCY + je .L200 + movq Y, YY movq M, %rax @@ -3105,8 +3111,42 @@ addps %xmm1, %xmm8 movsd %xmm8, (Y) + jmp .L999 ALIGN_3 + +.L200: + movq M, %rax + cmpq $0, %rax + jle .L999 + ALIGN_3 +.L201: + movsd (X), %xmm0 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + + movsd (Y), %xmm8 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + + movsd %xmm8, (Y) + addq INCY, Y + + decq %rax + jg .L201 + ALIGN_3 + .L999: xorq %rax, %rax diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S index f1616e362..1b7e3a563 100644 --- a/kernel/x86_64/zaxpy_sse2.S +++ b/kernel/x86_64/zaxpy_sse2.S @@ -1416,6 +1416,12 @@ movq Y, YY movq M, %rax +//If incx==0 || incy==0, avoid unloop and jump to end. 
+ cmpq $0, INCX + je .L58 + cmpq $0, INCY + je .L58 + sarq $3, %rax jle .L55 @@ -1769,6 +1775,7 @@ andq $1, %rax jle .L999 +.L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) @@ -1781,6 +1788,9 @@ movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) + + decq %rax + jg .L58 ALIGN_3 .L999: diff --git a/kernel/x86_64/zrot_sse.S b/kernel/x86_64/zrot_sse.S index 4aa0e7211..da79b4abe 100644 --- a/kernel/x86_64/zrot_sse.S +++ b/kernel/x86_64/zrot_sse.S @@ -1523,6 +1523,10 @@ .L50: movq N, %rax + cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 sarq $2, %rax jle .L55 ALIGN_3 diff --git a/reference/Makefile b/reference/Makefile index 6cbde28ef..034f23244 100644 --- a/reference/Makefile +++ b/reference/Makefile @@ -138,7 +138,8 @@ DBLASOBJS += \ dpotf2f.$(SUFFIX) dpotrff.$(SUFFIX) dtrti2f.$(SUFFIX) dtrtrif.$(SUFFIX) \ dlaswpf.$(SUFFIX) dgetrsf.$(SUFFIX) dgesvf.$(SUFFIX) dpotrif.$(SUFFIX) \ -QBLASOBJS += \ +QBLASOBJS += +# \ qgetf2f.$(SUFFIX) qgetrff.$(SUFFIX) qlauu2f.$(SUFFIX) qlauumf.$(SUFFIX) \ qpotf2f.$(SUFFIX) qpotrff.$(SUFFIX) qtrti2f.$(SUFFIX) qtrtrif.$(SUFFIX) \ qlaswpf.$(SUFFIX) qgetrsf.$(SUFFIX) qgesvf.$(SUFFIX) qpotrif.$(SUFFIX) \ @@ -153,7 +154,8 @@ ZBLASOBJS += \ zpotf2f.$(SUFFIX) zpotrff.$(SUFFIX) ztrti2f.$(SUFFIX) ztrtrif.$(SUFFIX) \ zlaswpf.$(SUFFIX) zgetrsf.$(SUFFIX) zgesvf.$(SUFFIX) zpotrif.$(SUFFIX) \ -XBLASOBJS += \ +XBLASOBJS += +# \ xgetf2f.$(SUFFIX) xgetrff.$(SUFFIX) xlauu2f.$(SUFFIX) xlauumf.$(SUFFIX) \ xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \ diff --git a/utest/Makefile b/utest/Makefile index 8b65a8d9d..9d512b877 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -5,12 +5,12 @@ include $(TOPDIR)/Makefile.system TARGET=openblas_utest CUNIT_LIB=/usr/local/lib/libcunit.a -OBJS=main.o test_rot.o +OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o all : run_test $(TARGET): $(OBJS) - $(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) + 
$(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) run_test: $(TARGET) ./$(TARGET) diff --git a/utest/common_utest.h b/utest/common_utest.h index 7d43b1811..613003307 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -36,9 +36,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#define CHECK_EPS 0.0002 +#define CHECK_EPS 0.00002 //Testcase list -void test_drot_incx_0(void); +void test_drot_inc_0(void); +void test_srot_inc_0(void); +void test_zdrot_inc_0(void); +void test_csrot_inc_0(void); + +void test_dswap_inc_0(void); +void test_zswap_inc_0(void); +void test_sswap_inc_0(void); +void test_cswap_inc_0(void); + +void test_daxpy_inc_0(void); +void test_zaxpy_inc_0(void); +void test_saxpy_inc_0(void); +void test_caxpy_inc_0(void); + +void test_zdotu_n_1(void); #endif diff --git a/utest/main.c b/utest/main.c index aac243eb9..c6fbd48e2 100644 --- a/utest/main.c +++ b/utest/main.c @@ -33,12 +33,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include - #include "common_utest.h" #include CU_TestInfo test_level1[]={ - {"Testing drot when incx & incy == 0",test_drot_incx_0}, + {"Testing srot when incx || incy == 0",test_srot_inc_0}, + {"Testing drot when incx || incy == 0",test_drot_inc_0}, + {"Testing csrot when incx || incy == 0",test_csrot_inc_0}, + {"Testing zdrot when incx || incy == 0",test_zdrot_inc_0}, + + {"Testing sswap with incx || incy == 0",test_sswap_inc_0}, + {"Testing dswap with incx || incy == 0",test_dswap_inc_0}, + {"Testing cswap with incx || incy == 0",test_cswap_inc_0}, + {"Testing zswap with incx || incy == 0",test_zswap_inc_0}, + + {"Testing saxpy with incx || incy == 0",test_saxpy_inc_0}, + {"Testing daxpy with incx || incy == 0",test_daxpy_inc_0}, + {"Testing caxpy with incx || incy == 0",test_caxpy_inc_0}, + {"Testing zaxpy with incx || incy == 0",test_zaxpy_inc_0}, + + {"Testing zdotu with n == 1",test_zdotu_n_1}, CU_TEST_INFO_NULL, }; @@ -64,7 +78,9 @@ int main() - + printf("Seting OK\n"); + fflush(stdout); + /* Run all tests using the CUnit Basic interface */ CU_basic_set_mode(CU_BRM_VERBOSE); diff --git a/utest/test_axpy.c b/utest/test_axpy.c new file mode 100644 index 000000000..a141d7a11 --- /dev/null +++ b/utest/test_axpy.c @@ -0,0 +1,117 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common_utest.h" + +void test_daxpy_inc_0(void) +{ + int i; + int N=8,incX=0,incY=0; + double a=0.25; + double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + + //OpenBLAS + BLASFUNC(daxpy)(&N,&a,x1,&incX,y1,&incY); + //reference + BLASFUNC_REF(daxpy)(&N,&a,x2,&incX,y2,&incY); + + for(i=0; i + +void test_zdotu_n_1(void) +{ + int N=1,incX=1,incY=1; + double x1[]={1.0,1.0}; + double y1[]={1.0,2.0}; + double x2[]={1.0,1.0}; + double y2[]={1.0,2.0}; + double _Complex result1=0.0; + double _Complex result2=0.0; + //OpenBLAS + result1=BLASFUNC(zdotu)(&N,x1,&incX,y1,&incY); + //reference + result2=BLASFUNC_REF(zdotu)(&N,x2,&incX,y2,&incY); + + CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); +// 
printf("\%lf,%lf\n",creal(result1),cimag(result1)); + +} + + diff --git a/utest/test_rot.c b/utest/test_rot.c index d02a137dd..f5332d486 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -32,9 +32,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common_utest.h" -void test_drot_incx_0(void) +void test_drot_inc_0(void) { - int i; + int i=0; int N=4,incX=0,incY=0; double c=0.25,s=0.5; double x1[]={1.0,3.0,5.0,7.0}; @@ -43,12 +43,75 @@ void test_drot_incx_0(void) double y2[]={2.0,4.0,6.0,8.0}; //OpenBLAS - drot_(&N,x1,&incX,y1,&incY,&c,&s); + BLASFUNC(drot)(&N,x1,&incX,y1,&incY,&c,&s); //reference - drotf_(&N,x2,&incX,y2,&incY,&c,&s); + BLASFUNC_REF(drot)(&N,x2,&incX,y2,&incY,&c,&s); for(i=0; i