Merge remote branch 'origin/develop' into bulldozer
This commit is contained in:
commit
1596ced242
8
Makefile
8
Makefile
|
@ -219,10 +219,10 @@ prof_lapack : lapack_prebuild
|
||||||
lapack_prebuild :
|
lapack_prebuild :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
|
|
@ -23,8 +23,8 @@ install : lib.grd
|
||||||
#for inc
|
#for inc
|
||||||
@echo \#ifndef OPENBLAS_CONFIG_H > $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#ifndef OPENBLAS_CONFIG_H > $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||||
@echo \#define OPENBLAS_CONFIG_H >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#define OPENBLAS_CONFIG_H >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||||
@cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@awk '{print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||||
@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||||
@cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||||
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||||
|
|
||||||
|
|
|
@ -229,6 +229,11 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# ifeq logical or
|
||||||
|
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
|
||||||
|
OS_WINDOWS=1
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef QUAD_PRECISION
|
ifdef QUAD_PRECISION
|
||||||
CCOMMON_OPT += -DQUAD_PRECISION
|
CCOMMON_OPT += -DQUAD_PRECISION
|
||||||
NO_EXPRECISION = 1
|
NO_EXPRECISION = 1
|
||||||
|
@ -470,10 +475,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||||
FCOMMON_OPT += -Wall
|
FCOMMON_OPT += -Wall
|
||||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||||
ifneq ($(NO_LAPACK), 1)
|
ifneq ($(NO_LAPACK), 1)
|
||||||
ifneq ($(C_COMPILER), LSB)
|
|
||||||
EXTRALIB += -lgfortran
|
EXTRALIB += -lgfortran
|
||||||
endif
|
endif
|
||||||
endif
|
|
||||||
ifdef NO_BINARY_MODE
|
ifdef NO_BINARY_MODE
|
||||||
ifeq ($(ARCH), mips64)
|
ifeq ($(ARCH), mips64)
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
|
@ -842,11 +845,18 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||||
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
|
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
|
||||||
#MAKEOVERRIDES =
|
#MAKEOVERRIDES =
|
||||||
|
|
||||||
|
#For LAPACK Fortran codes.
|
||||||
|
LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS))
|
||||||
|
LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS))
|
||||||
|
|
||||||
LAPACK_CFLAGS = $(CFLAGS)
|
LAPACK_CFLAGS = $(CFLAGS)
|
||||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
LAPACK_CFLAGS += -DLAPACK_ILP64
|
LAPACK_CFLAGS += -DLAPACK_ILP64
|
||||||
endif
|
endif
|
||||||
|
ifdef OS_WINDOWS
|
||||||
|
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
|
||||||
|
endif
|
||||||
ifeq ($(C_COMPILER), LSB)
|
ifeq ($(C_COMPILER), LSB)
|
||||||
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
|
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -606,7 +606,8 @@ clean ::
|
||||||
@if test -d $(ARCH); then \
|
@if test -d $(ARCH); then \
|
||||||
(cd $(ARCH) && $(MAKE) clean) \
|
(cd $(ARCH) && $(MAKE) clean) \
|
||||||
fi
|
fi
|
||||||
@rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \
|
@find . -name '*.o' | xargs rm -rf
|
||||||
|
@rm -rf *.a *.s *.po *.obj *.i *.so core core.* gmon.out *.cso \
|
||||||
*.csx *.is *~ *.exe *.flame *.pdb *.dwf \
|
*.csx *.is *~ *.exe *.flame *.pdb *.dwf \
|
||||||
gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \
|
gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \
|
||||||
*.pc *.pcl *.def *.i *.prof linktest.c \
|
*.pc *.pcl *.def *.i *.prof linktest.c \
|
||||||
|
|
|
@ -441,9 +441,10 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
if (blas_server_avail){
|
if (blas_server_avail){
|
||||||
|
|
||||||
SetEvent(pool.killed);
|
SetEvent(pool.killed);
|
||||||
|
printf("blas_num_threads=%d\n", blas_num_threads);
|
||||||
for(i = 0; i < blas_num_threads - 1; i++){
|
for(i = 0; i < blas_num_threads - 1; i++){
|
||||||
WaitForSingleObject(blas_threads[i], INFINITE);
|
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||||
|
TerminateThread(blas_threads[i],0);
|
||||||
}
|
}
|
||||||
|
|
||||||
blas_server_avail = 0;
|
blas_server_avail = 0;
|
||||||
|
|
|
@ -363,7 +363,7 @@ static void *alloc_mmap(void *address){
|
||||||
#define BENCH_ITERATION 4
|
#define BENCH_ITERATION 4
|
||||||
#define SCALING 2
|
#define SCALING 2
|
||||||
|
|
||||||
static inline BLASULONG run_bench(BLASULONG address, long size) {
|
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
|
||||||
|
|
||||||
BLASULONG original, *p;
|
BLASULONG original, *p;
|
||||||
BLASULONG start, stop, min;
|
BLASULONG start, stop, min;
|
||||||
|
@ -450,12 +450,12 @@ static void *alloc_mmap(void *address){
|
||||||
current = (SCALING - 1) * BUFFER_SIZE;
|
current = (SCALING - 1) * BUFFER_SIZE;
|
||||||
|
|
||||||
while(current > 0) {
|
while(current > 0) {
|
||||||
*(long *)start = (long)start + PAGESIZE;
|
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
|
||||||
start += PAGESIZE;
|
start += PAGESIZE;
|
||||||
current -= PAGESIZE;
|
current -= PAGESIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
*(long *)(start - PAGESIZE) = (BLASULONG)map_address;
|
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
|
||||||
|
|
||||||
start = (BLASULONG)map_address;
|
start = (BLASULONG)map_address;
|
||||||
|
|
||||||
|
@ -1170,7 +1170,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
|
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
|
||||||
|
|
||||||
long size;
|
size_t size;
|
||||||
BLASULONG buffer;
|
BLASULONG buffer;
|
||||||
|
|
||||||
size = BUFFER_SIZE - PAGESIZE;
|
size = BUFFER_SIZE - PAGESIZE;
|
||||||
|
|
|
@ -111,7 +111,7 @@ libgoto_hpl.def : gensymbol
|
||||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)
|
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)
|
||||||
|
|
||||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
|
|
||||||
symbol.$(SUFFIX) : symbol.S
|
symbol.$(SUFFIX) : symbol.S
|
||||||
$(CC) $(CFLAGS) -c -o $(@F) $^
|
$(CC) $(CFLAGS) -c -o $(@F) $^
|
||||||
|
@ -124,14 +124,17 @@ ifeq ($(OSNAME), Linux)
|
||||||
so : ../$(LIBSONAME)
|
so : ../$(LIBSONAME)
|
||||||
|
|
||||||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
||||||
|
ifneq ($(C_COMPILER), LSB)
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||||
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||||
ifneq ($(C_COMPILER), LSB)
|
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
else
|
else
|
||||||
#Use FC on LSB
|
#for LSB
|
||||||
$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||||
|
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||||
|
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
endif
|
endif
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
|
@ -145,7 +148,7 @@ so : ../$(LIBSONAME)
|
||||||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||||
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
|
$(FEXTRALIB) $(EXTRALIB)
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
|
|
|
@ -60,7 +60,6 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern void BLASFUNC(dtrtrilapack)(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info);
|
|
||||||
|
|
||||||
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||||
|
|
||||||
|
@ -133,18 +132,6 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if DOUBLE
|
|
||||||
// double trtri_U single thread error
|
|
||||||
// call dtrtri from lapack for a walk around.
|
|
||||||
if(uplo==0){
|
|
||||||
BLASFUNC(dtrtrilapack)(UPLO, DIAG, N, a, ldA, Info);
|
|
||||||
#ifndef PPC440
|
|
||||||
blas_memory_free(buffer);
|
|
||||||
#endif
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
|
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
|
@ -171,7 +171,7 @@
|
||||||
.L0x:
|
.L0x:
|
||||||
xorq M,M
|
xorq M,M
|
||||||
addq $1,M
|
addq $1,M
|
||||||
salq $22,M
|
salq $21,M
|
||||||
subq M,MMM
|
subq M,MMM
|
||||||
jge .L00
|
jge .L00
|
||||||
|
|
||||||
|
|
|
@ -103,7 +103,7 @@
|
||||||
vmovups -10*SIZE(AO,%rax,8), %xmm6
|
vmovups -10*SIZE(AO,%rax,8), %xmm6
|
||||||
vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14
|
vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14
|
||||||
vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15
|
vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_8x2
|
.macro SOLVE_8x2
|
||||||
|
@ -265,7 +265,7 @@
|
||||||
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
||||||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
||||||
vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11
|
vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -338,7 +338,7 @@
|
||||||
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9
|
vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -378,7 +378,7 @@
|
||||||
vmovups -16*SIZE(BO,%rax,2), %xmm1
|
vmovups -16*SIZE(BO,%rax,2), %xmm1
|
||||||
vmovddup -16*SIZE(AO,%rax,1), %xmm0
|
vmovddup -16*SIZE(AO,%rax,1), %xmm0
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_1x2
|
.macro SOLVE_1x2
|
||||||
|
@ -411,7 +411,7 @@
|
||||||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
||||||
vmovups -10*SIZE(AO,%rax,8), %xmm0
|
vmovups -10*SIZE(AO,%rax,8), %xmm0
|
||||||
vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11
|
vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_8x1
|
.macro SOLVE_8x1
|
||||||
|
@ -510,7 +510,7 @@
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
||||||
vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9
|
vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -560,7 +560,7 @@
|
||||||
vmovddup -16*SIZE(BO,%rax,1), %xmm1
|
vmovddup -16*SIZE(BO,%rax,1), %xmm1
|
||||||
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -592,7 +592,7 @@
|
||||||
vmovsd -16*SIZE(BO,%rax,1), %xmm1
|
vmovsd -16*SIZE(BO,%rax,1), %xmm1
|
||||||
vmovsd -16*SIZE(AO,%rax,1), %xmm0
|
vmovsd -16*SIZE(AO,%rax,1), %xmm0
|
||||||
vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_1x1
|
.macro SOLVE_1x1
|
||||||
|
|
|
@ -103,7 +103,7 @@
|
||||||
vmovups -10*SIZE(AO,%rax,8), %xmm6
|
vmovups -10*SIZE(AO,%rax,8), %xmm6
|
||||||
vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14
|
vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14
|
||||||
vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15
|
vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_8x2
|
.macro SOLVE_8x2
|
||||||
|
@ -177,7 +177,7 @@
|
||||||
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
||||||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
||||||
vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11
|
vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -226,7 +226,7 @@
|
||||||
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9
|
vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -262,7 +262,7 @@
|
||||||
vmovups -16*SIZE(BO,%rax,2), %xmm1
|
vmovups -16*SIZE(BO,%rax,2), %xmm1
|
||||||
vmovddup -16*SIZE(AO,%rax,1), %xmm0
|
vmovddup -16*SIZE(AO,%rax,1), %xmm0
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_1x2
|
.macro SOLVE_1x2
|
||||||
|
@ -306,7 +306,7 @@
|
||||||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10
|
||||||
vmovups -10*SIZE(AO,%rax,8), %xmm0
|
vmovups -10*SIZE(AO,%rax,8), %xmm0
|
||||||
vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11
|
vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_8x1
|
.macro SOLVE_8x1
|
||||||
|
@ -347,7 +347,7 @@
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
vmovups -14*SIZE(AO,%rax,4), %xmm0
|
||||||
vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9
|
vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -377,7 +377,7 @@
|
||||||
vmovddup -16*SIZE(BO,%rax,1), %xmm1
|
vmovddup -16*SIZE(BO,%rax,1), %xmm1
|
||||||
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
vmovups -16*SIZE(AO,%rax,2), %xmm0
|
||||||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -402,7 +402,7 @@
|
||||||
vmovsd -16*SIZE(BO,%rax,1), %xmm1
|
vmovsd -16*SIZE(BO,%rax,1), %xmm1
|
||||||
vmovsd -16*SIZE(AO,%rax,1), %xmm0
|
vmovsd -16*SIZE(AO,%rax,1), %xmm0
|
||||||
vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8
|
||||||
addq $SIZE, %rax
|
addq $ SIZE, %rax
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SOLVE_1x1
|
.macro SOLVE_1x1
|
||||||
|
|
|
@ -45,7 +45,11 @@ extern "C" {
|
||||||
|
|
||||||
#ifndef lapack_int
|
#ifndef lapack_int
|
||||||
#if defined(LAPACK_ILP64)
|
#if defined(LAPACK_ILP64)
|
||||||
|
#if defined(OPENBLAS_OS_WINDOWS)
|
||||||
|
#define lapack_int long long
|
||||||
|
#else
|
||||||
#define lapack_int long
|
#define lapack_int long
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
#define lapack_int int
|
#define lapack_int int
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -67,14 +67,14 @@ double sqrt(double);
|
||||||
#undef GETRF_FACTOR
|
#undef GETRF_FACTOR
|
||||||
#define GETRF_FACTOR 1.00
|
#define GETRF_FACTOR 1.00
|
||||||
|
|
||||||
static inline long FORMULA1(long M, long N, long IS, long BK, long T) {
|
static inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
|
||||||
|
|
||||||
double m = (double)(M - IS - BK);
|
double m = (double)(M - IS - BK);
|
||||||
double n = (double)(N - IS - BK);
|
double n = (double)(N - IS - BK);
|
||||||
double b = (double)BK;
|
double b = (double)BK;
|
||||||
double a = (double)T;
|
double a = (double)T;
|
||||||
|
|
||||||
return (long)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a);
|
return (BLASLONG)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -111,7 +111,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
||||||
|
|
||||||
if (args -> a == NULL) {
|
if (args -> a == NULL) {
|
||||||
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
||||||
sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
} else {
|
} else {
|
||||||
sb = (FLOAT *)args -> a;
|
sb = (FLOAT *)args -> a;
|
||||||
}
|
}
|
||||||
|
@ -221,7 +221,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
|
|
||||||
if (args -> a == NULL) {
|
if (args -> a == NULL) {
|
||||||
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
||||||
sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
} else {
|
} else {
|
||||||
sb = (FLOAT *)args -> a;
|
sb = (FLOAT *)args -> a;
|
||||||
}
|
}
|
||||||
|
@ -448,7 +448,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
|
|
||||||
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
|
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
|
||||||
|
|
||||||
sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
|
|
||||||
is = 0;
|
is = 0;
|
||||||
num_cpu = 0;
|
num_cpu = 0;
|
||||||
|
@ -685,7 +685,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
if (width > n - init_bk) width = n - init_bk;
|
if (width > n - init_bk) width = n - init_bk;
|
||||||
|
|
||||||
if (width < init_bk) {
|
if (width < init_bk) {
|
||||||
long temp;
|
BLASLONG temp;
|
||||||
|
|
||||||
temp = FORMULA2(m, n, 0, init_bk, args -> nthreads);
|
temp = FORMULA2(m, n, 0, init_bk, args -> nthreads);
|
||||||
temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
|
temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
|
||||||
|
@ -708,7 +708,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
is = 0;
|
is = 0;
|
||||||
num_cpu = 0;
|
num_cpu = 0;
|
||||||
|
|
||||||
sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
|
|
||||||
while (is < mn) {
|
while (is < mn) {
|
||||||
|
|
||||||
|
|
|
@ -178,7 +178,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
return info;
|
return info;
|
||||||
}
|
}
|
||||||
|
|
||||||
sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
|
|
||||||
info = 0;
|
info = 0;
|
||||||
|
|
||||||
|
|
|
@ -82,7 +82,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
return info;
|
return info;
|
||||||
}
|
}
|
||||||
|
|
||||||
sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
|
|
||||||
info = 0;
|
info = 0;
|
||||||
|
|
||||||
|
|
|
@ -185,7 +185,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
|
||||||
|
|
||||||
buffer[0] = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
for (i = 1; i < DIVIDE_RATE; i++) {
|
||||||
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
|
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,6 @@ ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_sing
|
||||||
|
|
||||||
XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX)
|
XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX)
|
||||||
|
|
||||||
DBLASOBJS += dtrtri_lapack.$(SUFFIX)
|
|
||||||
|
|
||||||
ifdef SMP
|
ifdef SMP
|
||||||
SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX)
|
SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX)
|
||||||
|
@ -54,9 +53,6 @@ dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c
|
||||||
dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c
|
dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c
|
||||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F)
|
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F)
|
||||||
|
|
||||||
dtrtri_lapack.$(SUFFIX) : dtrtri_lapack.f
|
|
||||||
$(FC) -c $(FFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F)
|
|
||||||
|
|
||||||
dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c
|
dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c
|
||||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F)
|
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -1,242 +0,0 @@
|
||||||
*> \brief \b DTRTRI
|
|
||||||
*
|
|
||||||
* =========== DOCUMENTATION ===========
|
|
||||||
*
|
|
||||||
* Online html documentation available at
|
|
||||||
* http://www.netlib.org/lapack/explore-html/
|
|
||||||
*
|
|
||||||
*> \htmlonly
|
|
||||||
*> Download DTRTRI + dependencies
|
|
||||||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dtrtri.f">
|
|
||||||
*> [TGZ]</a>
|
|
||||||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dtrtri.f">
|
|
||||||
*> [ZIP]</a>
|
|
||||||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dtrtri.f">
|
|
||||||
*> [TXT]</a>
|
|
||||||
*> \endhtmlonly
|
|
||||||
*
|
|
||||||
* Definition:
|
|
||||||
* ===========
|
|
||||||
*
|
|
||||||
* SUBROUTINE DTRTRI( UPLO, DIAG, N, A, LDA, INFO )
|
|
||||||
*
|
|
||||||
* .. Scalar Arguments ..
|
|
||||||
* CHARACTER DIAG, UPLO
|
|
||||||
* INTEGER INFO, LDA, N
|
|
||||||
* ..
|
|
||||||
* .. Array Arguments ..
|
|
||||||
* DOUBLE PRECISION A( LDA, * )
|
|
||||||
* ..
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*> \par Purpose:
|
|
||||||
* =============
|
|
||||||
*>
|
|
||||||
*> \verbatim
|
|
||||||
*>
|
|
||||||
*> DTRTRI computes the inverse of a real upper or lower triangular
|
|
||||||
*> matrix A.
|
|
||||||
*>
|
|
||||||
*> This is the Level 3 BLAS version of the algorithm.
|
|
||||||
*> \endverbatim
|
|
||||||
*
|
|
||||||
* Arguments:
|
|
||||||
* ==========
|
|
||||||
*
|
|
||||||
*> \param[in] UPLO
|
|
||||||
*> \verbatim
|
|
||||||
*> UPLO is CHARACTER*1
|
|
||||||
*> = 'U': A is upper triangular;
|
|
||||||
*> = 'L': A is lower triangular.
|
|
||||||
*> \endverbatim
|
|
||||||
*>
|
|
||||||
*> \param[in] DIAG
|
|
||||||
*> \verbatim
|
|
||||||
*> DIAG is CHARACTER*1
|
|
||||||
*> = 'N': A is non-unit triangular;
|
|
||||||
*> = 'U': A is unit triangular.
|
|
||||||
*> \endverbatim
|
|
||||||
*>
|
|
||||||
*> \param[in] N
|
|
||||||
*> \verbatim
|
|
||||||
*> N is INTEGER
|
|
||||||
*> The order of the matrix A. N >= 0.
|
|
||||||
*> \endverbatim
|
|
||||||
*>
|
|
||||||
*> \param[in,out] A
|
|
||||||
*> \verbatim
|
|
||||||
*> A is DOUBLE PRECISION array, dimension (LDA,N)
|
|
||||||
*> On entry, the triangular matrix A. If UPLO = 'U', the
|
|
||||||
*> leading N-by-N upper triangular part of the array A contains
|
|
||||||
*> the upper triangular matrix, and the strictly lower
|
|
||||||
*> triangular part of A is not referenced. If UPLO = 'L', the
|
|
||||||
*> leading N-by-N lower triangular part of the array A contains
|
|
||||||
*> the lower triangular matrix, and the strictly upper
|
|
||||||
*> triangular part of A is not referenced. If DIAG = 'U', the
|
|
||||||
*> diagonal elements of A are also not referenced and are
|
|
||||||
*> assumed to be 1.
|
|
||||||
*> On exit, the (triangular) inverse of the original matrix, in
|
|
||||||
*> the same storage format.
|
|
||||||
*> \endverbatim
|
|
||||||
*>
|
|
||||||
*> \param[in] LDA
|
|
||||||
*> \verbatim
|
|
||||||
*> LDA is INTEGER
|
|
||||||
*> The leading dimension of the array A. LDA >= max(1,N).
|
|
||||||
*> \endverbatim
|
|
||||||
*>
|
|
||||||
*> \param[out] INFO
|
|
||||||
*> \verbatim
|
|
||||||
*> INFO is INTEGER
|
|
||||||
*> = 0: successful exit
|
|
||||||
*> < 0: if INFO = -i, the i-th argument had an illegal value
|
|
||||||
*> > 0: if INFO = i, A(i,i) is exactly zero. The triangular
|
|
||||||
*> matrix is singular and its inverse can not be computed.
|
|
||||||
*> \endverbatim
|
|
||||||
*
|
|
||||||
* Authors:
|
|
||||||
* ========
|
|
||||||
*
|
|
||||||
*> \author Univ. of Tennessee
|
|
||||||
*> \author Univ. of California Berkeley
|
|
||||||
*> \author Univ. of Colorado Denver
|
|
||||||
*> \author NAG Ltd.
|
|
||||||
*
|
|
||||||
*> \date November 2011
|
|
||||||
*
|
|
||||||
*> \ingroup doubleOTHERcomputational
|
|
||||||
*
|
|
||||||
* =====================================================================
|
|
||||||
SUBROUTINE DTRTRILAPACK( UPLO, DIAG, N, A, LDA, INFO )
|
|
||||||
*
|
|
||||||
* -- LAPACK computational routine (version 3.4.0) --
|
|
||||||
* -- LAPACK is a software package provided by Univ. of Tennessee, --
|
|
||||||
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
|
||||||
* November 2011
|
|
||||||
*
|
|
||||||
* .. Scalar Arguments ..
|
|
||||||
CHARACTER DIAG, UPLO
|
|
||||||
INTEGER INFO, LDA, N
|
|
||||||
* ..
|
|
||||||
* .. Array Arguments ..
|
|
||||||
DOUBLE PRECISION A( LDA, * )
|
|
||||||
* ..
|
|
||||||
*
|
|
||||||
* =====================================================================
|
|
||||||
*
|
|
||||||
* .. Parameters ..
|
|
||||||
DOUBLE PRECISION ONE, ZERO
|
|
||||||
PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 )
|
|
||||||
* ..
|
|
||||||
* .. Local Scalars ..
|
|
||||||
LOGICAL NOUNIT, UPPER
|
|
||||||
INTEGER J, JB, NB, NN
|
|
||||||
* ..
|
|
||||||
* .. External Functions ..
|
|
||||||
LOGICAL LSAME
|
|
||||||
INTEGER ILAENV
|
|
||||||
EXTERNAL LSAME, ILAENV
|
|
||||||
* ..
|
|
||||||
* .. External Subroutines ..
|
|
||||||
EXTERNAL DTRMM, DTRSM, DTRTI2, XERBLA
|
|
||||||
* ..
|
|
||||||
* .. Intrinsic Functions ..
|
|
||||||
INTRINSIC MAX, MIN
|
|
||||||
* ..
|
|
||||||
* .. Executable Statements ..
|
|
||||||
*
|
|
||||||
* Test the input parameters.
|
|
||||||
*
|
|
||||||
INFO = 0
|
|
||||||
UPPER = LSAME( UPLO, 'U' )
|
|
||||||
NOUNIT = LSAME( DIAG, 'N' )
|
|
||||||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
|
|
||||||
INFO = -1
|
|
||||||
ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
|
|
||||||
INFO = -2
|
|
||||||
ELSE IF( N.LT.0 ) THEN
|
|
||||||
INFO = -3
|
|
||||||
ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
|
|
||||||
INFO = -5
|
|
||||||
END IF
|
|
||||||
IF( INFO.NE.0 ) THEN
|
|
||||||
CALL XERBLA( 'DTRTRI', -INFO )
|
|
||||||
RETURN
|
|
||||||
END IF
|
|
||||||
*
|
|
||||||
* Quick return if possible
|
|
||||||
*
|
|
||||||
IF( N.EQ.0 )
|
|
||||||
$ RETURN
|
|
||||||
*
|
|
||||||
* Check for singularity if non-unit.
|
|
||||||
*
|
|
||||||
IF( NOUNIT ) THEN
|
|
||||||
DO 10 INFO = 1, N
|
|
||||||
IF( A( INFO, INFO ).EQ.ZERO )
|
|
||||||
$ RETURN
|
|
||||||
10 CONTINUE
|
|
||||||
INFO = 0
|
|
||||||
END IF
|
|
||||||
*
|
|
||||||
* Determine the block size for this environment.
|
|
||||||
*
|
|
||||||
NB = ILAENV( 1, 'DTRTRI', UPLO // DIAG, N, -1, -1, -1 )
|
|
||||||
IF( NB.LE.1 .OR. NB.GE.N ) THEN
|
|
||||||
*
|
|
||||||
* Use unblocked code
|
|
||||||
*
|
|
||||||
CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO )
|
|
||||||
ELSE
|
|
||||||
*
|
|
||||||
* Use blocked code
|
|
||||||
*
|
|
||||||
IF( UPPER ) THEN
|
|
||||||
*
|
|
||||||
* Compute inverse of upper triangular matrix
|
|
||||||
*
|
|
||||||
DO 20 J = 1, N, NB
|
|
||||||
JB = MIN( NB, N-J+1 )
|
|
||||||
*
|
|
||||||
* Compute rows 1:j-1 of current block column
|
|
||||||
*
|
|
||||||
CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1,
|
|
||||||
$ JB, ONE, A, LDA, A( 1, J ), LDA )
|
|
||||||
CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1,
|
|
||||||
$ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA )
|
|
||||||
*
|
|
||||||
* Compute inverse of current diagonal block
|
|
||||||
*
|
|
||||||
CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO )
|
|
||||||
20 CONTINUE
|
|
||||||
ELSE
|
|
||||||
*
|
|
||||||
* Compute inverse of lower triangular matrix
|
|
||||||
*
|
|
||||||
NN = ( ( N-1 ) / NB )*NB + 1
|
|
||||||
DO 30 J = NN, 1, -NB
|
|
||||||
JB = MIN( NB, N-J+1 )
|
|
||||||
IF( J+JB.LE.N ) THEN
|
|
||||||
*
|
|
||||||
* Compute rows j+jb:n of current block column
|
|
||||||
*
|
|
||||||
CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG,
|
|
||||||
$ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA,
|
|
||||||
$ A( J+JB, J ), LDA )
|
|
||||||
CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG,
|
|
||||||
$ N-J-JB+1, JB, -ONE, A( J, J ), LDA,
|
|
||||||
$ A( J+JB, J ), LDA )
|
|
||||||
END IF
|
|
||||||
*
|
|
||||||
* Compute inverse of current diagonal block
|
|
||||||
*
|
|
||||||
CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO )
|
|
||||||
30 CONTINUE
|
|
||||||
END IF
|
|
||||||
END IF
|
|
||||||
*
|
|
||||||
RETURN
|
|
||||||
*
|
|
||||||
* End of DTRTRI
|
|
||||||
*
|
|
||||||
END
|
|
|
@ -127,7 +127,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
if (min_i > GEMM_P) min_i = GEMM_P;
|
if (min_i > GEMM_P) min_i = GEMM_P;
|
||||||
|
|
||||||
if (ls == i + bk) {
|
if (ls == i + bk) {
|
||||||
NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
|
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
|
||||||
|
|
||||||
|
GEMM_BETA(min_i, bk, 0, dm1,
|
||||||
|
#ifdef COMPLEX
|
||||||
|
ZERO,
|
||||||
|
#endif
|
||||||
|
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda);
|
||||||
|
|
||||||
TRSM_KERNEL_RN(min_i, bk, bk, dm1,
|
TRSM_KERNEL_RN(min_i, bk, bk, dm1,
|
||||||
#ifdef COMPLEX
|
#ifdef COMPLEX
|
||||||
|
@ -171,7 +177,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
min_i = i - is;
|
min_i = i - is;
|
||||||
if (min_i > GEMM_P) min_i = GEMM_P;
|
if (min_i > GEMM_P) min_i = GEMM_P;
|
||||||
|
|
||||||
NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
|
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
|
||||||
|
GEMM_BETA(min_i, bk, 0, dm1,
|
||||||
|
#ifdef COMPLEX
|
||||||
|
ZERO,
|
||||||
|
#endif
|
||||||
|
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda);
|
||||||
|
|
||||||
TRSM_KERNEL_RN(min_i, bk, bk, dm1,
|
TRSM_KERNEL_RN(min_i, bk, bk, dm1,
|
||||||
#ifdef COMPLEX
|
#ifdef COMPLEX
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
/*This is only for "make install" target.*/
|
/*This is only for "make install" target.*/
|
||||||
|
|
||||||
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
|
#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX)
|
||||||
#define WINDOWS_ABI
|
#define OPENBLAS_WINDOWS_ABI
|
||||||
#define OS_WINDOWS
|
#define OPENBLAS_OS_WINDOWS
|
||||||
|
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
#define DOUBLE_DEFINED DOUBLE
|
#define DOUBLE_DEFINED DOUBLE
|
||||||
|
@ -10,23 +10,23 @@
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEEDBUNDERSCORE
|
#ifdef OPENBLAS_NEEDBUNDERSCORE
|
||||||
#define BLASFUNC(FUNC) FUNC##_
|
#define BLASFUNC(FUNC) FUNC##_
|
||||||
#else
|
#else
|
||||||
#define BLASFUNC(FUNC) FUNC
|
#define BLASFUNC(FUNC) FUNC
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef QUAD_PRECISION
|
#ifdef OPENBLAS_QUAD_PRECISION
|
||||||
typedef struct {
|
typedef struct {
|
||||||
unsigned long x[2];
|
unsigned long x[2];
|
||||||
} xdouble;
|
} xdouble;
|
||||||
#elif defined EXPRECISION
|
#elif defined OPENBLAS_EXPRECISION
|
||||||
#define xdouble long double
|
#define xdouble long double
|
||||||
#else
|
#else
|
||||||
#define xdouble double
|
#define xdouble double
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__)
|
||||||
typedef long long BLASLONG;
|
typedef long long BLASLONG;
|
||||||
typedef unsigned long long BLASULONG;
|
typedef unsigned long long BLASULONG;
|
||||||
#else
|
#else
|
||||||
|
@ -34,7 +34,7 @@ typedef long BLASLONG;
|
||||||
typedef unsigned long BLASULONG;
|
typedef unsigned long BLASULONG;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef USE64BITINT
|
#ifdef OPENBLAS_USE64BITINT
|
||||||
typedef BLASLONG blasint;
|
typedef BLASLONG blasint;
|
||||||
#else
|
#else
|
||||||
typedef int blasint;
|
typedef int blasint;
|
||||||
|
|
Loading…
Reference in New Issue