From b8b27bec5cbd1e8624a14556e15890958f29b193 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 16 Feb 2011 00:18:45 +0800 Subject: [PATCH 01/22] fixed a bug in drot when incx or incy equals zero. --- kernel/x86_64/rot_sse2.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/rot_sse2.S b/kernel/x86_64/rot_sse2.S index 505554707..502940324 100644 --- a/kernel/x86_64/rot_sse2.S +++ b/kernel/x86_64/rot_sse2.S @@ -887,6 +887,10 @@ .L50: movq N, %rax + cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 sarq $2, %rax jle .L55 ALIGN_3 From 84ba64e65b53ad871b0cedbe5e1feeec4eebfdcc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 16 Feb 2011 00:18:45 +0800 Subject: [PATCH 02/22] fixed a bug in drot when incx or incy equals zero. --- kernel/x86_64/rot_sse2.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/rot_sse2.S b/kernel/x86_64/rot_sse2.S index 505554707..502940324 100644 --- a/kernel/x86_64/rot_sse2.S +++ b/kernel/x86_64/rot_sse2.S @@ -887,6 +887,10 @@ .L50: movq N, %rax + cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 sarq $2, %rax jle .L55 ALIGN_3 From c79696cc61e8db0cfe05245cec4324de67191564 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 16 Feb 2011 23:32:13 +0800 Subject: [PATCH 03/22] Added rot testcases when incx == incy == 0. --- utest/common_utest.h | 5 +++- utest/main.c | 3 ++ utest/test_rot.c | 67 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/utest/common_utest.h b/utest/common_utest.h index 7d43b1811..595364c28 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -36,9 +36,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#define CHECK_EPS 0.0002 +#define CHECK_EPS 0.00002 //Testcase list void test_drot_incx_0(void); +void test_srot_incx_0(void); +void test_zdrot_incx_0(void); +void test_csrot_incx_0(void); #endif diff --git a/utest/main.c b/utest/main.c index aac243eb9..87620b591 100644 --- a/utest/main.c +++ b/utest/main.c @@ -38,7 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include CU_TestInfo test_level1[]={ + {"Testing srot when incx & incy == 0",test_srot_incx_0}, {"Testing drot when incx & incy == 0",test_drot_incx_0}, + {"Testing csrot when incx & incy == 0",test_csrot_incx_0}, + {"Testing zdrot when incx & incy == 0",test_zdrot_incx_0}, CU_TEST_INFO_NULL, }; diff --git a/utest/test_rot.c b/utest/test_rot.c index d02a137dd..18a2bbdc6 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -43,9 +43,72 @@ void test_drot_incx_0(void) double y2[]={2.0,4.0,6.0,8.0}; //OpenBLAS - drot_(&N,x1,&incX,y1,&incY,&c,&s); + BLASFUNC(drot)(&N,x1,&incX,y1,&incY,&c,&s); //reference - drotf_(&N,x2,&incX,y2,&incY,&c,&s); + BLASFUNC_REF(drot)(&N,x2,&incX,y2,&incY,&c,&s); + + for(i=0; i Date: Wed, 16 Feb 2011 23:39:43 +0800 Subject: [PATCH 04/22] fixed #4 csrot returned the wrong result when incx==incy==0. --- kernel/x86_64/zrot_sse.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/zrot_sse.S b/kernel/x86_64/zrot_sse.S index 4aa0e7211..da79b4abe 100644 --- a/kernel/x86_64/zrot_sse.S +++ b/kernel/x86_64/zrot_sse.S @@ -1523,6 +1523,10 @@ .L50: movq N, %rax + cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 sarq $2, %rax jle .L55 ALIGN_3 From bd7a74234fd2c2cf916891da748e61069974017e Mon Sep 17 00:00:00 2001 From: Xianyi Date: Fri, 18 Feb 2011 02:50:32 +0800 Subject: [PATCH 05/22] Disable quad and x precision objs in reference. 
--- reference/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reference/Makefile b/reference/Makefile index 6cbde28ef..034f23244 100644 --- a/reference/Makefile +++ b/reference/Makefile @@ -138,7 +138,8 @@ DBLASOBJS += \ dpotf2f.$(SUFFIX) dpotrff.$(SUFFIX) dtrti2f.$(SUFFIX) dtrtrif.$(SUFFIX) \ dlaswpf.$(SUFFIX) dgetrsf.$(SUFFIX) dgesvf.$(SUFFIX) dpotrif.$(SUFFIX) \ -QBLASOBJS += \ +QBLASOBJS += +# \ qgetf2f.$(SUFFIX) qgetrff.$(SUFFIX) qlauu2f.$(SUFFIX) qlauumf.$(SUFFIX) \ qpotf2f.$(SUFFIX) qpotrff.$(SUFFIX) qtrti2f.$(SUFFIX) qtrtrif.$(SUFFIX) \ qlaswpf.$(SUFFIX) qgetrsf.$(SUFFIX) qgesvf.$(SUFFIX) qpotrif.$(SUFFIX) \ @@ -153,7 +154,8 @@ ZBLASOBJS += \ zpotf2f.$(SUFFIX) zpotrff.$(SUFFIX) ztrti2f.$(SUFFIX) ztrtrif.$(SUFFIX) \ zlaswpf.$(SUFFIX) zgetrsf.$(SUFFIX) zgesvf.$(SUFFIX) zpotrif.$(SUFFIX) \ -XBLASOBJS += \ +XBLASOBJS += +# \ xgetf2f.$(SUFFIX) xgetrff.$(SUFFIX) xlauu2f.$(SUFFIX) xlauumf.$(SUFFIX) \ xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \ From bfaa80c3164ae757ef8541f00d30f8300dda5ae0 Mon Sep 17 00:00:00 2001 From: Xianyi Date: Fri, 18 Feb 2011 03:00:58 +0800 Subject: [PATCH 06/22] fixed #4 csrot & drot returned the wrong result when incx==incy==0 on i686 arch. 
--- kernel/x86/rot_sse2.S | 4 ++++ kernel/x86/zrot_sse.S | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/kernel/x86/rot_sse2.S b/kernel/x86/rot_sse2.S index 8ec1d44bb..e9c5ba1ef 100644 --- a/kernel/x86/rot_sse2.S +++ b/kernel/x86/rot_sse2.S @@ -859,6 +859,10 @@ .L50: movl N, I + cmpl $0, INCX + je .L56 + cmpl $0, INCY + je .L56 sarl $2, I jle .L55 ALIGN_3 diff --git a/kernel/x86/zrot_sse.S b/kernel/x86/zrot_sse.S index d8d01009e..d10183f73 100644 --- a/kernel/x86/zrot_sse.S +++ b/kernel/x86/zrot_sse.S @@ -1285,6 +1285,12 @@ .L50: movl N, I +//if incx ==0 || incy==0 jump to the tail + cmpl $0, INCX + je .L56 + cmpl $0, INCY + je .L56 + sarl $2, I jle .L55 ALIGN_3 From e51364edb4fbfac434b93639339d3ad655ede5ed Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 18 Feb 2011 22:08:10 +0800 Subject: [PATCH 07/22] Fixed #5 Detected Intel Westmere (using Nehalem codes) in build and dynamic arch build. Thanks to Cao He from Dawning for supporting us with an Intel Xeon 5660 testbed. --- cpuid_x86.c | 19 +++++++++++++++++-- driver/others/dynamic.c | 5 +++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 288754497..98f744330 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -972,8 +972,15 @@ int get_cpuname(void){ return CPUTYPE_ATOM; case 13: return CPUTYPE_DUNNINGTON; - break; } + break; + case 2: + switch (model) { + case 12: + //Xeon Processor 5600 (Westmere-EP) + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -1289,8 +1296,16 @@ int get_coretype(void){ return CORE_ATOM; case 13: return CORE_DUNNINGTON; - break; } + break; + case 2: + switch (model) { + case 12: + //Xeon Processor 5600 (Westmere-EP) + return CORE_NEHALEM; + } + break; + + } case 15: if (model <= 0x2) return CORE_NORTHWOOD; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4e27717fc..8288f33aa 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -121,6 +121,11 @@ static gotoblas_t *get_coretype(void){ if ((model == 10) || 
(model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; if (model == 12) return &gotoblas_ATOM; return NULL; + + case 2: + //Intel Xeon Processor 5600 (Westmere-EP) + if (model == 12) return &gotoblas_NEHALEM; + return NULL; } case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; From 51454082c65bc8b980b6b6e03c991d4fbad4ea8a Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 19 Feb 2011 00:18:17 +0800 Subject: [PATCH 08/22] Updated readme file. --- README | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README b/README index 23cbdba72..0b31ad240 100644 --- a/README +++ b/README @@ -4,6 +4,8 @@ OpenBLAS Readme OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) 2.Intallation +Download from project homepage. http://xianyi.github.com/OpenBLAS/ +Or, check out codes from git://github.com/xianyi/OpenBLAS.git 1)Normal compile Please read GotoBLAS_02QuickInstall.txt or type "make" @@ -18,6 +20,12 @@ make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-g 3.Support CPU & OS Please read GotoBLAS_01Readme.txt +Additional support CPU: +x86_64: + Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. +MIPS64: + ICT Loongson 3A //The initial version used GotoBLAS2 MIPS64 kernels. Thus, the performance is not good. + 4.Usages Link with libopenblas.a or -lopenblas for shared library. @@ -31,7 +39,7 @@ OPENBLAS_NUM_THREAD is prior to OMP_NUM_THREADS. 
Please add a issue in https://github.com/xianyi/OpenBLAS/issues 6.To-Do List: -Support ICT Loongson 3A CPU +Optimize on ICT Loongson 3A CPU 7.Contact OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas From 8dd3fd7f26895f9d42bffc7d80101c12735e9db4 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sun, 20 Feb 2011 17:13:12 +0800 Subject: [PATCH 09/22] Added swap unit test with incx==0 and incy==0. --- common_reference.h | 7 +++ utest/Makefile | 4 +- utest/common_utest.h | 5 ++ utest/main.c | 11 +++- utest/test_swap.c | 116 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 139 insertions(+), 4 deletions(-) create mode 100644 utest/test_swap.c diff --git a/common_reference.h b/common_reference.h index d4dca859e..a324c4d9d 100644 --- a/common_reference.h +++ b/common_reference.h @@ -43,4 +43,11 @@ void BLASFUNC_REF(csrot) (blasint *, float *, blasint *, float *, blasint *, void BLASFUNC_REF(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); void BLASFUNC_REF(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); +void BLASFUNC_REF(sswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(dswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC_REF(cswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(zswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + #endif diff --git a/utest/Makefile b/utest/Makefile index 8b65a8d9d..2b92b56ef 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -5,12 +5,12 @@ include $(TOPDIR)/Makefile.system TARGET=openblas_utest CUNIT_LIB=/usr/local/lib/libcunit.a -OBJS=main.o test_rot.o +OBJS=main.o test_rot.o test_swap.o all : run_test $(TARGET): $(OBJS) - $(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) + $(CC) -o 
$@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) run_test: $(TARGET) ./$(TARGET) diff --git a/utest/common_utest.h b/utest/common_utest.h index 595364c28..7e947b95d 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -44,4 +44,9 @@ void test_srot_incx_0(void); void test_zdrot_incx_0(void); void test_csrot_incx_0(void); +void test_dswap_inc_0(void); +void test_zswap_inc_0(void); +void test_sswap_inc_0(void); +void test_cswap_inc_0(void); + #endif diff --git a/utest/main.c b/utest/main.c index 87620b591..0e2d401b2 100644 --- a/utest/main.c +++ b/utest/main.c @@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include - #include "common_utest.h" #include @@ -42,6 +41,12 @@ CU_TestInfo test_level1[]={ {"Testing drot when incx & incy == 0",test_drot_incx_0}, {"Testing csrot when incx & incy == 0",test_csrot_incx_0}, {"Testing zdrot when incx & incy == 0",test_zdrot_incx_0}, + + {"Testing sswap with incx & incy == 0",test_sswap_inc_0}, + {"Testing dswap with incx & incy == 0",test_dswap_inc_0}, + {"Testing cswap with incx & incy == 0",test_cswap_inc_0}, + {"Testing zswap with incx & incy == 0",test_zswap_inc_0}, + CU_TEST_INFO_NULL, }; @@ -67,7 +72,9 @@ int main() - + printf("Seting OK\n"); + fflush(stdout); + /* Run all tests using the CUnit Basic interface */ CU_basic_set_mode(CU_BRM_VERBOSE); diff --git a/utest/test_swap.c b/utest/test_swap.c new file mode 100644 index 000000000..5218a7198 --- /dev/null +++ b/utest/test_swap.c @@ -0,0 +1,116 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common_utest.h" + +void test_dswap_inc_0(void) +{ + int i; + int N=4,incX=0,incY=0; + double x1[]={1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0}; + double y2[]={2.0,4.0,6.0,8.0}; + + //OpenBLAS + BLASFUNC(dswap)(&N,x1,&incX,y1,&incY); + //reference + BLASFUNC_REF(dswap)(&N,x2,&incX,y2,&incY); + + for(i=0; i Date: Sun, 20 Feb 2011 17:14:38 +0800 Subject: [PATCH 10/22] Fixed #6. Disable multi-thread swap when incx==0 or incy==0. 
--- interface/swap.c | 7 ++++++- interface/zswap.c | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/interface/swap.c b/interface/swap.c index 7676246f9..271fa083a 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -78,7 +78,12 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #ifdef SMP nthreads = num_cpu_avail(1); - + + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/interface/zswap.c b/interface/zswap.c index f4a03a550..06a889204 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -80,6 +80,11 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif From 109b86d00eaa6421dd6f4dd4c5bc2b0cd5eb50f8 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 21 Feb 2011 00:17:33 +0800 Subject: [PATCH 11/22] Added axpy unit test with incx==0 and incy==0. 
--- common_reference.h | 5 ++ utest/Makefile | 2 +- utest/common_utest.h | 5 ++ utest/main.c | 5 ++ utest/test_axpy.c | 117 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 utest/test_axpy.c diff --git a/common_reference.h b/common_reference.h index a324c4d9d..4bc46bcef 100644 --- a/common_reference.h +++ b/common_reference.h @@ -50,4 +50,9 @@ void BLASFUNC_REF(cswap) (blasint *, float *, blasint *, float *, blasint * void BLASFUNC_REF(zswap) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC_REF(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC_REF(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); + #endif diff --git a/utest/Makefile b/utest/Makefile index 2b92b56ef..defa2a7db 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -5,7 +5,7 @@ include $(TOPDIR)/Makefile.system TARGET=openblas_utest CUNIT_LIB=/usr/local/lib/libcunit.a -OBJS=main.o test_rot.o test_swap.o +OBJS=main.o test_rot.o test_swap.o test_axpy.o all : run_test diff --git a/utest/common_utest.h b/utest/common_utest.h index 7e947b95d..f4fc72878 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -49,4 +49,9 @@ void test_zswap_inc_0(void); void test_sswap_inc_0(void); void test_cswap_inc_0(void); +void test_daxpy_inc_0(void); +void test_zaxpy_inc_0(void); +void test_saxpy_inc_0(void); +void test_caxpy_inc_0(void); + #endif diff --git a/utest/main.c b/utest/main.c index 0e2d401b2..eddf1ce34 100644 --- a/utest/main.c +++ b/utest/main.c @@ -47,6 +47,11 @@ CU_TestInfo test_level1[]={ {"Testing cswap with incx & incy == 0",test_cswap_inc_0}, {"Testing zswap with incx & incy == 0",test_zswap_inc_0}, + 
{"Testing saxpy with incx & incy == 0",test_saxpy_inc_0}, + {"Testing daxpy with incx & incy == 0",test_daxpy_inc_0}, + {"Testing caxpy with incx & incy == 0",test_caxpy_inc_0}, + {"Testing zaxpy with incx & incy == 0",test_zaxpy_inc_0}, + CU_TEST_INFO_NULL, }; diff --git a/utest/test_axpy.c b/utest/test_axpy.c new file mode 100644 index 000000000..a141d7a11 --- /dev/null +++ b/utest/test_axpy.c @@ -0,0 +1,117 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common_utest.h" + +void test_daxpy_inc_0(void) +{ + int i; + int N=8,incX=0,incY=0; + double a=0.25; + double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + + //OpenBLAS + BLASFUNC(daxpy)(&N,&a,x1,&incX,y1,&incY); + //reference + BLASFUNC_REF(daxpy)(&N,&a,x2,&incX,y2,&incY); + + for(i=0; i Date: Mon, 21 Feb 2011 00:24:21 +0800 Subject: [PATCH 12/22] Fixed #7. 1)Disable the multi-thread and 2) Modified kernel codes to avoid unloop in axpy function when incx==0 or incy==0. --- interface/axpy.c | 5 +++++ interface/zaxpy.c | 5 +++++ kernel/x86_64/axpy_sse.S | 6 ++++++ kernel/x86_64/axpy_sse2.S | 6 ++++++ kernel/x86_64/zaxpy_sse.S | 40 ++++++++++++++++++++++++++++++++++++++ kernel/x86_64/zaxpy_sse2.S | 10 ++++++++++ 6 files changed, 72 insertions(+) diff --git a/interface/axpy.c b/interface/axpy.c index 03b981985..dd75b758c 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -81,6 +81,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. 
+ if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index d3355ea57..9ed72efb9 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -83,6 +83,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S index 23c2ec54e..9a7512575 100644 --- a/kernel/x86_64/axpy_sse.S +++ b/kernel/x86_64/axpy_sse.S @@ -1463,6 +1463,12 @@ .L50: movq M, %rax movq Y, YY +//If incx==0 || incy==0, avoid unloop. + cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 + sarq $3, %rax jle .L55 ALIGN_3 diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S index 554602917..dea8d0382 100644 --- a/kernel/x86_64/axpy_sse2.S +++ b/kernel/x86_64/axpy_sse2.S @@ -805,6 +805,12 @@ .L40: movq Y, YY movq M, %rax +//If incx==0 || incy==0, avoid unloop. + cmpq $0, INCX + je .L46 + cmpq $0, INCY + je .L46 + sarq $3, %rax jle .L45 ALIGN_3 diff --git a/kernel/x86_64/zaxpy_sse.S b/kernel/x86_64/zaxpy_sse.S index 69cdedaaa..42b920cfb 100644 --- a/kernel/x86_64/zaxpy_sse.S +++ b/kernel/x86_64/zaxpy_sse.S @@ -2893,6 +2893,12 @@ unpcklps %xmm13, %xmm15 #endif +//If incx==0 || incy==0, avoid unloop and jump to end. 
+ cmpq $0, INCX + je .L200 + cmpq $0, INCY + je .L200 + movq Y, YY movq M, %rax @@ -3105,8 +3111,42 @@ addps %xmm1, %xmm8 movsd %xmm8, (Y) + jmp .L999 ALIGN_3 + +.L200: + movq M, %rax + cmpq $0, %rax + jle .L999 + ALIGN_3 +.L201: + movsd (X), %xmm0 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + + movsd (Y), %xmm8 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + + movsd %xmm8, (Y) + addq INCY, Y + + decq %rax + jg .L201 + ALIGN_3 + .L999: xorq %rax, %rax diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S index f1616e362..1b7e3a563 100644 --- a/kernel/x86_64/zaxpy_sse2.S +++ b/kernel/x86_64/zaxpy_sse2.S @@ -1416,6 +1416,12 @@ movq Y, YY movq M, %rax +//If incx==0 || incy==0, avoid unloop and jump to end. + cmpq $0, INCX + je .L58 + cmpq $0, INCY + je .L58 + sarq $3, %rax jle .L55 @@ -1769,6 +1775,7 @@ andq $1, %rax jle .L999 +.L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) @@ -1781,6 +1788,9 @@ movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) + + decq %rax + jg .L58 ALIGN_3 .L999: From afbe3c97914ad15f2c83d2fd02005c794a37135b Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 21 Feb 2011 00:42:46 +0800 Subject: [PATCH 13/22] =?UTF-8?q?Improved=20the=20quality=20of=20codes=20i?= =?UTF-8?q?n=20unit=20test.=20Thanks=20Jos=C3=A9=20Luis=20Garc=C3=ADa=20Pa?= =?UTF-8?q?llero?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- common_reference.h | 20 ++++++++++---------- utest/common_utest.h | 8 ++++---- utest/main.c | 24 ++++++++++++------------ utest/test_rot.c | 20 ++++++++++---------- utest/test_swap.c | 37 +++++++++++++++++-------------------- 5 files changed, 53 insertions(+), 56 deletions(-) diff --git a/common_reference.h b/common_reference.h index 4bc46bcef..27a27a638 100644 --- a/common_reference.h +++ b/common_reference.h @@ 
-43,16 +43,16 @@ void BLASFUNC_REF(csrot) (blasint *, float *, blasint *, float *, blasint *, void BLASFUNC_REF(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); void BLASFUNC_REF(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); -void BLASFUNC_REF(sswap) (blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC_REF(dswap) (blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC_REF(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC_REF(cswap) (blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC_REF(zswap) (blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC_REF(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC_REF(sswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(dswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC_REF(cswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(zswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC_REF(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); -void BLASFUNC_REF(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); -void BLASFUNC_REF(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); -void BLASFUNC_REF(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC_REF(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC_REF(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); #endif diff 
--git a/utest/common_utest.h b/utest/common_utest.h index f4fc72878..43c7f7f82 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -39,10 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CHECK_EPS 0.00002 //Testcase list -void test_drot_incx_0(void); -void test_srot_incx_0(void); -void test_zdrot_incx_0(void); -void test_csrot_incx_0(void); +void test_drot_inc_0(void); +void test_srot_inc_0(void); +void test_zdrot_inc_0(void); +void test_csrot_inc_0(void); void test_dswap_inc_0(void); void test_zswap_inc_0(void); diff --git a/utest/main.c b/utest/main.c index eddf1ce34..ef4232dbe 100644 --- a/utest/main.c +++ b/utest/main.c @@ -37,20 +37,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include CU_TestInfo test_level1[]={ - {"Testing srot when incx & incy == 0",test_srot_incx_0}, - {"Testing drot when incx & incy == 0",test_drot_incx_0}, - {"Testing csrot when incx & incy == 0",test_csrot_incx_0}, - {"Testing zdrot when incx & incy == 0",test_zdrot_incx_0}, + {"Testing srot when incx || incy == 0",test_srot_inc_0}, + {"Testing drot when incx || incy == 0",test_drot_inc_0}, + {"Testing csrot when incx || incy == 0",test_csrot_inc_0}, + {"Testing zdrot when incx || incy == 0",test_zdrot_inc_0}, - {"Testing sswap with incx & incy == 0",test_sswap_inc_0}, - {"Testing dswap with incx & incy == 0",test_dswap_inc_0}, - {"Testing cswap with incx & incy == 0",test_cswap_inc_0}, - {"Testing zswap with incx & incy == 0",test_zswap_inc_0}, + {"Testing sswap with incx || incy == 0",test_sswap_inc_0}, + {"Testing dswap with incx || incy == 0",test_dswap_inc_0}, + {"Testing cswap with incx || incy == 0",test_cswap_inc_0}, + {"Testing zswap with incx || incy == 0",test_zswap_inc_0}, - {"Testing saxpy with incx & incy == 0",test_saxpy_inc_0}, - {"Testing daxpy with incx & incy == 0",test_daxpy_inc_0}, - {"Testing caxpy with incx & incy == 0",test_caxpy_inc_0}, - {"Testing zaxpy with incx & incy == 
0",test_zaxpy_inc_0}, + {"Testing saxpy with incx || incy == 0",test_saxpy_inc_0}, + {"Testing daxpy with incx || incy == 0",test_daxpy_inc_0}, + {"Testing caxpy with incx || incy == 0",test_caxpy_inc_0}, + {"Testing zaxpy with incx || incy == 0",test_zaxpy_inc_0}, CU_TEST_INFO_NULL, }; diff --git a/utest/test_rot.c b/utest/test_rot.c index 18a2bbdc6..f5332d486 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -32,9 +32,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common_utest.h" -void test_drot_incx_0(void) +void test_drot_inc_0(void) { - int i; + int i=0; int N=4,incX=0,incY=0; double c=0.25,s=0.5; double x1[]={1.0,3.0,5.0,7.0}; @@ -53,9 +53,9 @@ void test_drot_incx_0(void) } } -void test_zdrot_incx_0(void) +void test_zdrot_inc_0(void) { - int i; + int i=0; int N=4,incX=0,incY=0; double c=0.25,s=0.5; double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; @@ -68,15 +68,15 @@ void test_zdrot_incx_0(void) //reference BLASFUNC_REF(zdrot)(&N,x2,&incX,y2,&incY,&c,&s); - for(i=0; i Date: Tue, 22 Feb 2011 13:40:40 +0800 Subject: [PATCH 14/22] Supported building debug version. --- Makefile.rule | 6 ++++++ README | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index ecafe0cdc..2fc82619b 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -89,7 +89,13 @@ VERSION = 0.1 # UTEST_CHECK = 1 # Common Optimization Flag; -O2 is enough. +# DEBUG = 1 + +ifeq ($(DEBUG), 1) +COMMON_OPT += -g +else COMMON_OPT += -O2 +endif # Profiling flags COMMON_PROF = -pg diff --git a/README b/README index 0b31ad240..1a10cb198 100644 --- a/README +++ b/README @@ -17,6 +17,9 @@ examples: On X86 box, compile this library for loongson3a CPU. make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A +3)Debug version +make DEBUG=1 + 3.Support CPU & OS Please read GotoBLAS_01Readme.txt @@ -39,7 +42,7 @@ OPENBLAS_NUM_THREAD is prior to OMP_NUM_THREADS. 
Please add a issue in https://github.com/xianyi/OpenBLAS/issues 6.To-Do List: -Optimize on ICT Loongson 3A CPU +Optimization on ICT Loongson 3A CPU 7.Contact OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas From cd2cbabecc7fe0a69e6d9120cda013551f506a0d Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 22 Feb 2011 14:16:46 +0800 Subject: [PATCH 15/22] Added unit test case (zdotu, N=1). --- common_reference.h | 5 ++++ utest/Makefile | 2 +- utest/common_utest.h | 2 ++ utest/main.c | 1 + utest/test_dotu.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 utest/test_dotu.c diff --git a/common_reference.h b/common_reference.h index 27a27a638..04b11f80f 100644 --- a/common_reference.h +++ b/common_reference.h @@ -55,4 +55,9 @@ void BLASFUNC_REF(daxpy) (blasint *, double *, double *, blasint *, double *, bl void BLASFUNC_REF(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC_REF(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +float _Complex BLASFUNC_REF(cdotu) (blasint *, float *, blasint *, float *, blasint *); +float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *, blasint *); +double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *); +double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *); + #endif diff --git a/utest/Makefile b/utest/Makefile index defa2a7db..9d512b877 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -5,7 +5,7 @@ include $(TOPDIR)/Makefile.system TARGET=openblas_utest CUNIT_LIB=/usr/local/lib/libcunit.a -OBJS=main.o test_rot.o test_swap.o test_axpy.o +OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o all : run_test diff --git a/utest/common_utest.h b/utest/common_utest.h index 43c7f7f82..613003307 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -54,4 +54,6 @@ void 
test_zaxpy_inc_0(void); void test_saxpy_inc_0(void); void test_caxpy_inc_0(void); +void test_zdotu_n_1(void); + #endif diff --git a/utest/main.c b/utest/main.c index ef4232dbe..c6fbd48e2 100644 --- a/utest/main.c +++ b/utest/main.c @@ -52,6 +52,7 @@ CU_TestInfo test_level1[]={ {"Testing caxpy with incx || incy == 0",test_caxpy_inc_0}, {"Testing zaxpy with incx || incy == 0",test_zaxpy_inc_0}, + {"Testing zdotu with n == 1",test_zdotu_n_1}, CU_TEST_INFO_NULL, }; diff --git a/utest/test_dotu.c b/utest/test_dotu.c new file mode 100644 index 000000000..bb720c85a --- /dev/null +++ b/utest/test_dotu.c @@ -0,0 +1,56 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common_utest.h" +#include + +void test_zdotu_n_1(void) +{ + int N=1,incX=1,incY=1; + double x1[]={1.0,1.0}; + double y1[]={1.0,2.0}; + double x2[]={1.0,1.0}; + double y2[]={1.0,2.0}; + double _Complex result1=0.0; + double _Complex result2=0.0; + //OpenBLAS + result1=BLASFUNC(zdotu)(&N,x1,&incX,y1,&incY); + //reference + result2=BLASFUNC_REF(zdotu)(&N,x2,&incX,y2,&incY); + + CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); +// printf("\%lf,%lf\n",creal(result1),cimag(result1)); + +} + + From 12214e1d0f9b7e8abc4e1776d64d1527d15b38a7 Mon Sep 17 00:00:00 2001 From: Xianyi Date: Wed, 23 Feb 2011 20:08:34 +0800 Subject: [PATCH 16/22] Fixed #7. Modified axpy kernel codes to avoid unloop with incx==0 or incy==0 in x86 32bits arch. --- kernel/x86/axpy_sse.S | 6 ++++++ kernel/x86/axpy_sse2.S | 6 ++++++ kernel/x86/zaxpy_sse.S | 38 ++++++++++++++++++++++++++++++++++++++ kernel/x86/zaxpy_sse2.S | 11 +++++++++++ 4 files changed, 61 insertions(+) diff --git a/kernel/x86/axpy_sse.S b/kernel/x86/axpy_sse.S index 291a219ce..e06d90184 100644 --- a/kernel/x86/axpy_sse.S +++ b/kernel/x86/axpy_sse.S @@ -1440,6 +1440,12 @@ .L50: movl M, %eax movl Y, YY +//If incx==0 || incy==0, avoid unloop. 
+ cmpl $0, INCX + je .L56 + cmpl $0, INCY + je .L56 + sarl $3, %eax jle .L55 ALIGN_3 diff --git a/kernel/x86/axpy_sse2.S b/kernel/x86/axpy_sse2.S index 5e31d3dba..9b2d5d808 100644 --- a/kernel/x86/axpy_sse2.S +++ b/kernel/x86/axpy_sse2.S @@ -698,6 +698,12 @@ .L40: movl Y, YY movl M, %eax +//If incx==0 || incy==0, avoid unloop. + cmpl $0, INCX + je .L46 + cmpl $0, INCY + je .L46 + sarl $3, %eax jle .L45 ALIGN_3 diff --git a/kernel/x86/zaxpy_sse.S b/kernel/x86/zaxpy_sse.S index edd9929cd..9c94cec44 100644 --- a/kernel/x86/zaxpy_sse.S +++ b/kernel/x86/zaxpy_sse.S @@ -2857,6 +2857,11 @@ unpcklps ALPHA_I, ALPHA_R unpcklps %xmm5, ALPHA_I #endif +//If incx==0 || incy==0, avoid unloop and jump to end. + cmpl $0, INCX + je .L200 + cmpl $0, INCY + je .L200 movl Y, YY @@ -3090,8 +3095,41 @@ addps %xmm1, %xmm4 movsd %xmm4, (Y) + jmp .L999 ALIGN_3 +.L200: + movl M, %eax + cmpl $0, %eax + jle .L999 + ALIGN_3 + +.L201: + movsd (X), %xmm0 + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + + movsd (Y), %xmm4 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + + movsd %xmm4, (Y) + + decl %eax + jg .L201 + + ALIGN_3 .L999: popl %ebp popl %ebx diff --git a/kernel/x86/zaxpy_sse2.S b/kernel/x86/zaxpy_sse2.S index 40afdc3fc..9c2caa7e8 100644 --- a/kernel/x86/zaxpy_sse2.S +++ b/kernel/x86/zaxpy_sse2.S @@ -1318,6 +1318,12 @@ movl Y, YY movl M, %eax +//If incx==0 || incy==0, avoid unloop and jump to end. 
+ cmpl $0, INCX + je .L58 + cmpl $0, INCY + je .L58 + sarl $2, %eax jle .L55 @@ -1498,6 +1504,7 @@ andl $1, %eax jle .L999 +.L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) @@ -1510,6 +1517,10 @@ movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) + + + decl %eax + jg .L58 ALIGN_3 .L999: From 128418f49baf98616418677fd2f06dee3b00ece2 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 24 Feb 2011 15:16:21 +0800 Subject: [PATCH 17/22] Fixed #10. Supported GOTO_NUM_THREADS & GOTO_THREAD_TIMEOUT environment variables. --- Makefile.rule | 2 +- README | 12 ++++++++++-- driver/others/blas_server.c | 16 +++++++++++++++- driver/others/init.c | 3 ++- driver/others/memory.c | 7 +++++++ 5 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 2fc82619b..997b52d98 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -70,7 +70,7 @@ VERSION = 0.1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by GOTO_THREAD_TIMEOUT +# system). Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 # Using special device driver for mapping physically contigous memory diff --git a/README b/README index 1a10cb198..d1846399c 100644 --- a/README +++ b/README @@ -32,11 +32,19 @@ MIPS64: 4.Usages Link with libopenblas.a or -lopenblas for shared library. -Set the number of threads. for example, +4.1 Set the number of threads with environment variables. for example, export OPENBLAS_NUM_THREADS=4 + or +export GOTO_NUM_THREADS=4 or export OMP_NUM_THREADS=4 -OPENBLAS_NUM_THREAD is prior to OMP_NUM_THREADS. + +The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. + +4.2 Set the number of threads by calling functions.
for example, +void goto_set_num_threads(int num_threads); +or +void openblas_set_num_threads(int num_threads); 5.Report Bugs Please add a issue in https://github.com/xianyi/OpenBLAS/issues diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 11f058e96..c0f77c4c9 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -525,7 +525,16 @@ int blas_thread_init(void){ if (thread_timeout < 4) thread_timeout = 4; if (thread_timeout > 30) thread_timeout = 30; thread_timeout = (1 << thread_timeout); - } + }else{ + p = getenv("GOTO_THREAD_TIMEOUT"); + if (p) { + thread_timeout = atoi(p); + if (thread_timeout < 4) thread_timeout = 4; + if (thread_timeout > 30) thread_timeout = 30; + thread_timeout = (1 << thread_timeout); + } + } + for(i = 0; i < blas_num_threads - 1; i++){ @@ -790,6 +799,11 @@ void goto_set_num_threads(int num_threads) { } +void openblas_set_num_threads(int num_threads) { + goto_set_num_threads(num_threads); + +} + /* Compatible function with pthread_create / join */ int gotoblas_pthread(int numthreads, void *function, void *args, int stride) { diff --git a/driver/others/init.c b/driver/others/init.c index 94f883728..452656c55 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -581,6 +581,7 @@ void gotoblas_affinity_init(void) { numprocs = 0; #else numprocs = readenv("OPENBLAS_NUM_THREADS"); + if (numprocs == 0) numprocs = readenv("GOTO_NUM_THREADS"); #endif if (numprocs == 0) numprocs = readenv("OMP_NUM_THREADS"); @@ -666,7 +667,7 @@ void gotoblas_affinity_init(void) { setup_mempolicy(); - if (readenv("OPENBLAS_MAIN_FREE")) { + if (readenv("OPENBLAS_MAIN_FREE") || readenv("GOTOBLAS_MAIN_FREE")) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); } diff --git a/driver/others/memory.c b/driver/others/memory.c index fc5265715..fa41465f6 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -231,6 +231,13 @@ int blas_get_cpu_number(void){ p = getenv("OPENBLAS_NUM_THREADS"); if 
(p) blas_goto_num = atoi(p); if (blas_goto_num < 0) blas_goto_num = 0; + + if (blas_goto_num == 0) { + p = getenv("GOTO_NUM_THREADS"); + if (p) blas_goto_num = atoi(p); + if (blas_goto_num < 0) blas_goto_num = 0; + } + #endif blas_omp_num = 0; From 1b97ec1a7c6ebb8e1a8c1a01f90f23d835868770 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 26 Feb 2011 11:19:54 +0800 Subject: [PATCH 18/22] Added DEBUG option in Makefile.rule. Fixed DEBUG typo mistakes. --- Makefile.rule | 2 +- driver/level3/gemm3m_level3.c | 2 +- driver/level3/level3.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 997b52d98..d9013dd83 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -92,7 +92,7 @@ VERSION = 0.1 # DEBUG = 1 ifeq ($(DEBUG), 1) -COMMON_OPT += -g +COMMON_OPT += -g -DDEBUG else COMMON_OPT += -O2 endif diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c index 8c5473c03..df4d723ab 100644 --- a/driver/level3/gemm3m_level3.c +++ b/driver/level3/gemm3m_level3.c @@ -297,7 +297,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, printf("GEMM: SA .. %p SB .. %p\n", sa, sb); #endif -#ifdef DEBUG +#ifdef TIMING innercost = 0; outercost = 0; kernelcost = 0; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 62b310aba..20e811cd0 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -278,7 +278,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); #endif -#ifdef DEBUG +#ifdef TIMING innercost = 0; outercost = 0; kernelcost = 0; From f7a5e049e2653149aedb35cb172ec89e4748a222 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 26 Feb 2011 11:51:39 +0800 Subject: [PATCH 19/22] Enable Debug flags in memory alloc and init functions. 
--- driver/others/init.c | 2 +- driver/others/memory.c | 25 +++++++++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 452656c55..7ee7dc45d 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -92,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SHARE_NAME "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map" #define NODE_DIR "/sys/devices/system/node" -#undef DEBUG +//#undef DEBUG /* Private variables */ typedef struct { diff --git a/driver/others/memory.c b/driver/others/memory.c index fa41465f6..dd8334477 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -68,9 +68,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#undef DEBUG +//#undef DEBUG #include "common.h" +#include #ifdef OS_WINDOWS #define ALLOC_WINDOWS @@ -386,11 +387,23 @@ static void *alloc_mmap(void *address){ MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { - + #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#ifdef DEBUG + int ret; + ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } + +#else + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#endif #endif + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; @@ -986,7 +999,7 @@ void *blas_memory_alloc(int procpos){ memory[position].addr = map_address; #ifdef DEBUG - printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_area[position], position); + printf(" Mapping Succeeded. 
%p(%d)\n", (void *)memory[position].addr, position); #endif } @@ -1017,7 +1030,7 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf("Mapped : %p %3d\n\n", - (void *)alloc_area[position], position); + (void *)memory[position].addr, position); #endif return (void *)memory[position].addr; @@ -1060,7 +1073,7 @@ void blas_memory_free(void *free_area){ #ifdef DEBUG for (position = 0; position < NUM_BUFFERS; position++) - printf("%4ld %p : %d\n", position, alloc_area[position], alloc_used[position]); + printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif return; From cdf33edac37eb667a3c1c2fc2ca9450781a03933 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 26 Feb 2011 12:27:56 +0800 Subject: [PATCH 20/22] Added Changelog. Fixed #11. --- Changelog.txt | 25 +++++++++++++++++++++++++ README | 3 +++ 2 files changed, 28 insertions(+) create mode 100644 Changelog.txt diff --git a/Changelog.txt b/Changelog.txt new file mode 100644 index 000000000..7a05cdc4b --- /dev/null +++ b/Changelog.txt @@ -0,0 +1,25 @@ +OpenBLAS ChangeLog +==================================================================== +Version 0.1 (in development) +26-Feb-2011 + +common: + * Added DEBUG=1 rule in Makefile.rule to build debug version. + * Disable compiling quad precision in reference BLAS library(netlib BLAS). + * Added unit testcases in utest/ subdir. Used CUnit framework. + * Supported OPENBLAS_* & GOTO_* environment variables (Please see README) + * Imported GotoBLAS2 1.13 BSD version + +x86/x86_64: + * Modified ?axpy functions to return same netlib BLAS results + when incx==0 or incy==0 (Refs issue #7 on github) + * Modified ?swap functions to return same netlib BLAS results + when incx==0 or incy==0 (Refs issue #6 on github) + * Modified ?rot functions to return same netlib BLAS results + when incx==0 or incy==0 (Refs issue #4 on github) + * Detect Intel Westmere to use Nehalem codes. + * Fixed a typo bug about compiling dynamic ARCH library.
+MIPS64: + * Improve daxpy performance on ICT Loongson 3A. + * Supported ICT Loongson 3A CPU (Refs issue #1 on github) +==================================================================== diff --git a/README b/README index d1846399c..9b04f6f99 100644 --- a/README +++ b/README @@ -54,3 +54,6 @@ Optimization on ICT Loongson 3A CPU 7.Contact OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas + +8.ChangeLog +Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version. \ No newline at end of file From 588737210d1db00f7e271ea451c26380e43c1ed9 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 2 Mar 2011 13:38:32 +0800 Subject: [PATCH 21/22] Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34. Fixed #12. Thank Mr.Ei-ji Nakama providing this patch. --- common_linux.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common_linux.h b/common_linux.h index d18cd2b72..8b3d44bfa 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,8 +68,9 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { - - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 + unsigned long null_nodemask=0; + return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { From 0e782b9bd31de13fb1898af3c5c8c9ede6cada30 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 2 Mar 2011 13:40:55 +0800 Subject: [PATCH 22/22] updated the changelog.
--- Changelog.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 7a05cdc4b..b3c438471 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -4,6 +4,8 @@ Version 0.1 (in development) 26-Feb-2011 common: + * Fixed random SEGFAULT when nodemask==NULL on Linux 2.6.34 and above. + Thanks to Mr. Ei-ji Nakama for providing this patch. (Refs issue #12 on github) * Added DEBUG=1 rule in Makefile.rule to build debug version. * Disable compiling quad precision in reference BLAS library(netlib BLAS). * Added unit testcases in utest/ subdir. Used CUnit framework.