From ce5626a3841604bf2e9e29a7caf149695c3e3748 Mon Sep 17 00:00:00 2001 From: yieldthought Date: Tue, 8 Oct 2013 16:37:17 +0200 Subject: [PATCH 01/11] Remove -Wl,--retain-symbols-file from dynamic library linking to fix tool support The aim is to restrict the symbols exported from openBLAS, but for dynamic libraries --retain-symbols-file has the opposite of the intended effect. It removes symbols from the .symtab section of the library, which is used by tools such as debuggers, profilers, objdump etc. but does not remove them from .dynsym, which is the section used by the runtime when loading the binary. In short, using --retain-symbols-file like this doesn't prevent symbol leakage but does prevent tools from analyzing applications linked with openBLAS. This patch improves tool support for openBLAS applications but does not address symbol leakage through .dynsym. --- exports/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 1531f3cc1..0bc9ec6e0 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -126,7 +126,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) + -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else @@ -145,7 +145,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) + $(FEXTRALIB) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
rm -f linktest From beffee7d914702f5b7211d41e3a77cd08d6c4d96 Mon Sep 17 00:00:00 2001 From: wangqian Date: Fri, 11 Oct 2013 03:20:20 +0800 Subject: [PATCH 02/11] Fixed buffer overflow bug in kernel/x86_64/dgemv_t.S file. --- kernel/x86_64/dgemv_t.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemv_t.S b/kernel/x86_64/dgemv_t.S index 48b3f17c4..3d132c3b5 100644 --- a/kernel/x86_64/dgemv_t.S +++ b/kernel/x86_64/dgemv_t.S @@ -171,7 +171,7 @@ .L0x: xorq M,M addq $1,M - salq $22,M + salq $21,M subq M,MMM jge .L00 From c937090121258e1b85f80af5c87c092d8283ffaa Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 22 Oct 2013 13:24:47 +0800 Subject: [PATCH 03/11] Added gfortran dependency for LSB/lsbcc. --- Makefile.system | 2 -- exports/Makefile | 9 ++++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 858160fc4..1e9248a6d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -470,10 +470,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) -ifneq ($(C_COMPILER), LSB) EXTRALIB += -lgfortran endif -endif ifdef NO_BINARY_MODE ifeq ($(ARCH), mips64) ifdef BINARY64 diff --git a/exports/Makefile b/exports/Makefile index 0bc9ec6e0..780b0dcf8 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -124,14 +124,17 @@ ifeq ($(OSNAME), Linux) so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c +ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) -ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else -#Use FC on LSB - $(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
+#for LSB + env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) + $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. endif rm -f linktest From f5a0038bad348c1e7c917beba076a0ce74117e71 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Wed, 23 Oct 2013 18:43:00 -0400 Subject: [PATCH 04/11] Use FC instead of CC to link the dynamic library on OS X Avoids problems of libgfortran not being found. --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 780b0dcf8..8e50a9809 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -111,7 +111,7 @@ libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F) $(LIBDYNNAME) : ../$(LIBNAME) osx.def - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) symbol.$(SUFFIX) : symbol.S $(CC) $(CFLAGS) -c -o $(@F) $^ From dfd1064d7be6b5c43759c38cab79f094a453e906 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Nov 2013 15:09:33 +0800 Subject: [PATCH 05/11] refs #287. Don't enable OpenMP for netlib LAPACK sequential Fortran codes. 
--- Makefile | 8 ++++---- Makefile.system | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 67d425359..294502f81 100644 --- a/Makefile +++ b/Makefile @@ -219,10 +219,10 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.system b/Makefile.system index 1e9248a6d..da6a6a560 100644 --- a/Makefile.system +++ b/Makefile.system @@ -840,6 +840,10 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = +#For LAPACK Fortran codes. +LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS)) +LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS)) + LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 From 6d9d70c55c5755d3ca666d92f626539c23063dd6 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Nov 2013 15:59:00 +0800 Subject: [PATCH 06/11] Fixed #315. Added OPENBLAS_ prefix to openblas_config.h. 
--- Makefile.install | 4 ++-- openblas_config_template.h | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Makefile.install b/Makefile.install index cbe98bc5b..8319b46db 100644 --- a/Makefile.install +++ b/Makefile.install @@ -23,8 +23,8 @@ install : lib.grd #for inc @echo \#ifndef OPENBLAS_CONFIG_H > $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define OPENBLAS_CONFIG_H >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @awk '{print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h diff --git a/openblas_config_template.h b/openblas_config_template.h index 1017caff9..3b3435b0e 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -1,8 +1,8 @@ /*This is only for "make install" target.*/ -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) -#define WINDOWS_ABI -#define OS_WINDOWS +#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX) +#define OPENBLAS_WINDOWS_ABI +#define OPENBLAS_OS_WINDOWS #ifdef DOUBLE #define DOUBLE_DEFINED DOUBLE @@ -10,23 +10,23 @@ #endif #endif -#ifdef NEEDBUNDERSCORE +#ifdef OPENBLAS_NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ #else #define BLASFUNC(FUNC) FUNC #endif -#ifdef QUAD_PRECISION +#ifdef OPENBLAS_QUAD_PRECISION typedef struct { unsigned long x[2]; } xdouble; -#elif defined EXPRECISION +#elif defined OPENBLAS_EXPRECISION #define xdouble long double #else #define xdouble double #endif -#if defined(OS_WINDOWS) && defined(__64BIT__) +#if 
defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__) typedef long long BLASLONG; typedef unsigned long long BLASULONG; #else @@ -34,7 +34,7 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#ifdef USE64BITINT +#ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; #else typedef int blasint; From 73770e60b898cabc6d0d6ff10f99aba9baa56f9b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 7 Nov 2013 01:08:39 +0800 Subject: [PATCH 07/11] Refs #309. Fixed trtri_U single thread computational bug. --- interface/trtri.c | 13 -- lapack/trtri/Makefile | 4 - lapack/trtri/dtrtri_lapack.f | 242 ---------------------------------- lapack/trtri/trtri_U_single.c | 19 ++- 4 files changed, 15 insertions(+), 263 deletions(-) delete mode 100644 lapack/trtri/dtrtri_lapack.f diff --git a/interface/trtri.c b/interface/trtri.c index 007dbd7fa..5aa5e9b9b 100644 --- a/interface/trtri.c +++ b/interface/trtri.c @@ -60,7 +60,6 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT * }; #endif -extern void BLASFUNC(dtrtrilapack)(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info); int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ @@ -133,18 +132,6 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In if (args.nthreads == 1) { #endif -#if DOUBLE - // double trtri_U single thread error - // call dtrtri from lapack for a walk around. 
- if(uplo==0){ - BLASFUNC(dtrtrilapack)(UPLO, DIAG, N, a, ldA, Info); -#ifndef PPC440 - blas_memory_free(buffer); -#endif - return 0; - } -#endif - *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP diff --git a/lapack/trtri/Makefile b/lapack/trtri/Makefile index 10d3cb7fd..626c47bbf 100644 --- a/lapack/trtri/Makefile +++ b/lapack/trtri/Makefile @@ -13,7 +13,6 @@ ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_sing XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX) -DBLASOBJS += dtrtri_lapack.$(SUFFIX) ifdef SMP SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX) @@ -54,9 +53,6 @@ dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) -dtrtri_lapack.$(SUFFIX) : dtrtri_lapack.f - $(FC) -c $(FFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) - dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) diff --git a/lapack/trtri/dtrtri_lapack.f b/lapack/trtri/dtrtri_lapack.f deleted file mode 100644 index 8e9a08170..000000000 --- a/lapack/trtri/dtrtri_lapack.f +++ /dev/null @@ -1,242 +0,0 @@ -*> \brief \b DTRTRI -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -*> \htmlonly -*> Download DTRTRI + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> -*> [TXT] -*> \endhtmlonly -* -* Definition: -* =========== -* -* SUBROUTINE DTRTRI( UPLO, DIAG, N, A, LDA, INFO ) -* -* .. Scalar Arguments .. -* CHARACTER DIAG, UPLO -* INTEGER INFO, LDA, N -* .. -* .. Array Arguments .. -* DOUBLE PRECISION A( LDA, * ) -* .. -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> DTRTRI computes the inverse of a real upper or lower triangular -*> matrix A. 
-*> -*> This is the Level 3 BLAS version of the algorithm. -*> \endverbatim -* -* Arguments: -* ========== -* -*> \param[in] UPLO -*> \verbatim -*> UPLO is CHARACTER*1 -*> = 'U': A is upper triangular; -*> = 'L': A is lower triangular. -*> \endverbatim -*> -*> \param[in] DIAG -*> \verbatim -*> DIAG is CHARACTER*1 -*> = 'N': A is non-unit triangular; -*> = 'U': A is unit triangular. -*> \endverbatim -*> -*> \param[in] N -*> \verbatim -*> N is INTEGER -*> The order of the matrix A. N >= 0. -*> \endverbatim -*> -*> \param[in,out] A -*> \verbatim -*> A is DOUBLE PRECISION array, dimension (LDA,N) -*> On entry, the triangular matrix A. If UPLO = 'U', the -*> leading N-by-N upper triangular part of the array A contains -*> the upper triangular matrix, and the strictly lower -*> triangular part of A is not referenced. If UPLO = 'L', the -*> leading N-by-N lower triangular part of the array A contains -*> the lower triangular matrix, and the strictly upper -*> triangular part of A is not referenced. If DIAG = 'U', the -*> diagonal elements of A are also not referenced and are -*> assumed to be 1. -*> On exit, the (triangular) inverse of the original matrix, in -*> the same storage format. -*> \endverbatim -*> -*> \param[in] LDA -*> \verbatim -*> LDA is INTEGER -*> The leading dimension of the array A. LDA >= max(1,N). -*> \endverbatim -*> -*> \param[out] INFO -*> \verbatim -*> INFO is INTEGER -*> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, A(i,i) is exactly zero. The triangular -*> matrix is singular and its inverse can not be computed. -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. 
-* -*> \date November 2011 -* -*> \ingroup doubleOTHERcomputational -* -* ===================================================================== - SUBROUTINE DTRTRILAPACK( UPLO, DIAG, N, A, LDA, INFO ) -* -* -- LAPACK computational routine (version 3.4.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 -* -* .. Scalar Arguments .. - CHARACTER DIAG, UPLO - INTEGER INFO, LDA, N -* .. -* .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ) -* .. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE, ZERO - PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) -* .. -* .. Local Scalars .. - LOGICAL NOUNIT, UPPER - INTEGER J, JB, NB, NN -* .. -* .. External Functions .. - LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV -* .. -* .. External Subroutines .. - EXTERNAL DTRMM, DTRSM, DTRTI2, XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX, MIN -* .. -* .. Executable Statements .. -* -* Test the input parameters. -* - INFO = 0 - UPPER = LSAME( UPLO, 'U' ) - NOUNIT = LSAME( DIAG, 'N' ) - IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN - INFO = -1 - ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN - INFO = -2 - ELSE IF( N.LT.0 ) THEN - INFO = -3 - ELSE IF( LDA.LT.MAX( 1, N ) ) THEN - INFO = -5 - END IF - IF( INFO.NE.0 ) THEN - CALL XERBLA( 'DTRTRI', -INFO ) - RETURN - END IF -* -* Quick return if possible -* - IF( N.EQ.0 ) - $ RETURN -* -* Check for singularity if non-unit. -* - IF( NOUNIT ) THEN - DO 10 INFO = 1, N - IF( A( INFO, INFO ).EQ.ZERO ) - $ RETURN - 10 CONTINUE - INFO = 0 - END IF -* -* Determine the block size for this environment. -* - NB = ILAENV( 1, 'DTRTRI', UPLO // DIAG, N, -1, -1, -1 ) - IF( NB.LE.1 .OR. 
NB.GE.N ) THEN -* -* Use unblocked code -* - CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO ) - ELSE -* -* Use blocked code -* - IF( UPPER ) THEN -* -* Compute inverse of upper triangular matrix -* - DO 20 J = 1, N, NB - JB = MIN( NB, N-J+1 ) -* -* Compute rows 1:j-1 of current block column -* - CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, - $ JB, ONE, A, LDA, A( 1, J ), LDA ) - CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, - $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) -* -* Compute inverse of current diagonal block -* - CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) - 20 CONTINUE - ELSE -* -* Compute inverse of lower triangular matrix -* - NN = ( ( N-1 ) / NB )*NB + 1 - DO 30 J = NN, 1, -NB - JB = MIN( NB, N-J+1 ) - IF( J+JB.LE.N ) THEN -* -* Compute rows j+jb:n of current block column -* - CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG, - $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, - $ A( J+JB, J ), LDA ) - CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG, - $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, - $ A( J+JB, J ), LDA ) - END IF -* -* Compute inverse of current diagonal block -* - CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) - 30 CONTINUE - END IF - END IF -* - RETURN -* -* End of DTRTRI -* - END diff --git a/lapack/trtri/trtri_U_single.c b/lapack/trtri/trtri_U_single.c index 72133d896..c79281cfb 100644 --- a/lapack/trtri/trtri_U_single.c +++ b/lapack/trtri/trtri_U_single.c @@ -127,8 +127,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (min_i > GEMM_P) min_i = GEMM_P; if (ls == i + bk) { - NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); - + //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + GEMM_BETA(min_i, bk, 0, dm1, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); + TRSM_KERNEL_RN(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, @@ -171,8 +177,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, 
FLOAT *sa, min_i = i - is; if (min_i > GEMM_P) min_i = GEMM_P; - NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); - + //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + GEMM_BETA(min_i, bk, 0, dm1, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); + TRSM_KERNEL_RN(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, From 2f5fdd2000cf316e23d9f43fc3049d18a1290289 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 7 Nov 2013 08:12:03 +0800 Subject: [PATCH 08/11] Refs #314. Fixed clang compiling bug on OSX. --- kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S | 16 ++++++++-------- kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S index 374f45096..9e15fa240 100644 --- a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S +++ b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S @@ -103,7 +103,7 @@ vmovups -10*SIZE(AO,%rax,8), %xmm6 vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x2 @@ -265,7 +265,7 @@ vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -338,7 +338,7 @@ vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -378,7 +378,7 @@ vmovups -16*SIZE(BO,%rax,2), %xmm1 vmovddup -16*SIZE(AO,%rax,1), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x2 @@ -411,7 +411,7 @@ vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vmovups -10*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x1 @@ -510,7 +510,7 @@ vfmaddpd %xmm8 , %xmm0 , 
%xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -560,7 +560,7 @@ vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -592,7 +592,7 @@ vmovsd -16*SIZE(BO,%rax,1), %xmm1 vmovsd -16*SIZE(AO,%rax,1), %xmm0 vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x1 diff --git a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S index 8fa53efa7..8d3964aee 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S +++ b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S @@ -103,7 +103,7 @@ vmovups -10*SIZE(AO,%rax,8), %xmm6 vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x2 @@ -177,7 +177,7 @@ vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -226,7 +226,7 @@ vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -262,7 +262,7 @@ vmovups -16*SIZE(BO,%rax,2), %xmm1 vmovddup -16*SIZE(AO,%rax,1), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x2 @@ -306,7 +306,7 @@ vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vmovups -10*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x1 @@ -347,7 +347,7 @@ vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -377,7 +377,7 @@ vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - 
addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -402,7 +402,7 @@ vmovsd -16*SIZE(BO,%rax,1), %xmm1 vmovsd -16*SIZE(AO,%rax,1), %xmm0 vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x1 From 6d8095bcb9947290d3e3da04c20b0db9b7e08c6f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 7 Nov 2013 13:06:42 +0800 Subject: [PATCH 09/11] Avoid the "argument list too long" error in make clean. --- Makefile.tail | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.tail b/Makefile.tail index 53dd0caad..56f8d820c 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -606,7 +606,8 @@ clean :: @if test -d $(ARCH); then \ (cd $(ARCH) && $(MAKE) clean) \ fi - @rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \ + @find . -name '*.o' | xargs rm -rf + @rm -rf *.a *.s *.po *.obj *.i *.so core core.* gmon.out *.cso \ *.csx *.is *~ *.exe *.flame *.pdb *.dwf \ gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \ *.pc *.pcl *.def *.i *.prof linktest.c \ From a2942456ef7c88fba952851bdd94c1951c59dd4d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 13 Nov 2013 10:00:18 +0800 Subject: [PATCH 10/11] Refs #307. Fixed the hang when freeing the OpenBLAS DLL on Windows. 
--- driver/others/blas_server_win32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bd1069c5e..8723a6fa7 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -441,9 +441,10 @@ int BLASFUNC(blas_thread_shutdown)(void){ if (blas_server_avail){ SetEvent(pool.killed); - + printf("blas_num_threads=%d\n", blas_num_threads); for(i = 0; i < blas_num_threads - 1; i++){ - WaitForSingleObject(blas_threads[i], INFINITE); + WaitForSingleObject(blas_threads[i], 5); //INFINITE); + TerminateThread(blas_threads[i],0); } blas_server_avail = 0; From 5048a80032a9d020b585ffc59323274f1d14e6b7 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 14 Nov 2013 13:46:42 +0800 Subject: [PATCH 11/11] Refs #283. Fixed the incorrect usage of long data type for Windows 64. --- Makefile.system | 8 ++++++++ driver/others/memory.c | 8 ++++---- lapack-netlib/lapacke/include/lapacke_config.h | 4 ++++ lapack/getrf/getrf_parallel.c | 14 +++++++------- lapack/getrf/getrf_parallel_omp.c | 2 +- lapack/getrf/getrf_single.c | 2 +- lapack/potrf/potrf_parallel.c | 2 +- 7 files changed, 26 insertions(+), 14 deletions(-) diff --git a/Makefile.system b/Makefile.system index da6a6a560..7da074a65 100644 --- a/Makefile.system +++ b/Makefile.system @@ -229,6 +229,11 @@ endif endif endif +# ifeq logical or +ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) +OS_WINDOWS=1 +endif + ifdef QUAD_PRECISION CCOMMON_OPT += -DQUAD_PRECISION NO_EXPRECISION = 1 @@ -849,6 +854,9 @@ LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 LAPACK_CFLAGS += -DLAPACK_ILP64 endif +ifdef OS_WINDOWS +LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS +endif ifeq ($(C_COMPILER), LSB) LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE endif diff --git a/driver/others/memory.c b/driver/others/memory.c index 4f35691ff..35758d13c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -363,7 +363,7 
@@ static void *alloc_mmap(void *address){ #define BENCH_ITERATION 4 #define SCALING 2 -static inline BLASULONG run_bench(BLASULONG address, long size) { +static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { BLASULONG original, *p; BLASULONG start, stop, min; @@ -450,12 +450,12 @@ static void *alloc_mmap(void *address){ current = (SCALING - 1) * BUFFER_SIZE; while(current > 0) { - *(long *)start = (long)start + PAGESIZE; + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; } - *(long *)(start - PAGESIZE) = (BLASULONG)map_address; + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; start = (BLASULONG)map_address; @@ -1170,7 +1170,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) - long size; + size_t size; BLASULONG buffer; size = BUFFER_SIZE - PAGESIZE; diff --git a/lapack-netlib/lapacke/include/lapacke_config.h b/lapack-netlib/lapacke/include/lapacke_config.h index 1e2509bf0..561b2736b 100644 --- a/lapack-netlib/lapacke/include/lapacke_config.h +++ b/lapack-netlib/lapacke/include/lapacke_config.h @@ -45,7 +45,11 @@ extern "C" { #ifndef lapack_int #if defined(LAPACK_ILP64) +#if defined(OPENBLAS_OS_WINDOWS) +#define lapack_int long long +#else #define lapack_int long +#endif #else #define lapack_int int #endif diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 21ea9d5f5..3dbc70e9d 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -67,14 +67,14 @@ double sqrt(double); #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 -static inline long FORMULA1(long M, long N, long IS, long BK, long T) { +static inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); double n = (double)(N - IS - BK); double b = (double)BK; double a = (double)T; - return (long)((n + GETRF_FACTOR * m * b * (1. 
- a) / (b + m)) / a); + return (BLASLONG)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a); } @@ -111,7 +111,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); - sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); } else { sb = (FLOAT *)args -> a; } @@ -221,7 +221,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); - sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); } else { sb = (FLOAT *)args -> a; } @@ -448,7 +448,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); - sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); is = 0; num_cpu = 0; @@ -685,7 +685,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (width > n - init_bk) width = n - init_bk; if (width < init_bk) { - long temp; + BLASLONG temp; temp = FORMULA2(m, n, 0, init_bk, args -> nthreads); temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); @@ -708,7 +708,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, is = 0; num_cpu = 0; - sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); while (is < mn) { diff --git 
a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index 4922b9b52..6eda30a52 100644 --- a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -178,7 +178,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, return info; } - sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index fcea0ae89..f1818ea97 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -82,7 +82,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, return info; } - sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index eec9b6e05..11f7f533c 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -185,7 +185,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); - buffer[0] = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; }