Merge remote branch 'origin/develop' into piledriver
This commit is contained in:
		
						commit
						e09dc279a2
					
				
							
								
								
									
										8
									
								
								Makefile
								
								
								
								
							
							
						
						
									
										8
									
								
								Makefile
								
								
								
								
							| 
						 | 
				
			
			@ -219,10 +219,10 @@ prof_lapack : lapack_prebuild
 | 
			
		|||
lapack_prebuild :
 | 
			
		||||
ifndef NOFORTRAN
 | 
			
		||||
	-@echo "FORTRAN     = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "OPTS        = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "POPTS       = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "NOOPT       = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "PNOOPT      = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "OPTS        = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "NOOPT       = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "PNOOPT      = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "LOADOPTS    = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "CC          = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
	-@echo "override CFLAGS      = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -23,8 +23,8 @@ install : 	lib.grd
 | 
			
		|||
#for inc 
 | 
			
		||||
	@echo \#ifndef OPENBLAS_CONFIG_H > $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
	@echo \#define OPENBLAS_CONFIG_H >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
	@cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
	@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
	@awk '{print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
	@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
	@cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
	@echo \#endif  \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -229,6 +229,11 @@ endif
 | 
			
		|||
endif
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
# ifeq logical or
 | 
			
		||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
 | 
			
		||||
OS_WINDOWS=1
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifdef QUAD_PRECISION
 | 
			
		||||
CCOMMON_OPT	+= -DQUAD_PRECISION
 | 
			
		||||
NO_EXPRECISION = 1
 | 
			
		||||
| 
						 | 
				
			
			@ -470,10 +475,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
 | 
			
		|||
FCOMMON_OPT += -Wall
 | 
			
		||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
 | 
			
		||||
ifneq ($(NO_LAPACK), 1)
 | 
			
		||||
ifneq ($(C_COMPILER), LSB)
 | 
			
		||||
EXTRALIB += -lgfortran 
 | 
			
		||||
endif
 | 
			
		||||
endif
 | 
			
		||||
ifdef NO_BINARY_MODE
 | 
			
		||||
ifeq ($(ARCH), mips64)
 | 
			
		||||
ifdef BINARY64
 | 
			
		||||
| 
						 | 
				
			
			@ -842,11 +845,18 @@ override FFLAGS     += $(COMMON_OPT) $(FCOMMON_OPT)
 | 
			
		|||
override FPFLAGS    += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
 | 
			
		||||
#MAKEOVERRIDES =
 | 
			
		||||
 | 
			
		||||
#For LAPACK Fortran codes.
 | 
			
		||||
LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS))
 | 
			
		||||
LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS))
 | 
			
		||||
 | 
			
		||||
LAPACK_CFLAGS = $(CFLAGS)
 | 
			
		||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H 
 | 
			
		||||
ifdef INTERFACE64
 | 
			
		||||
LAPACK_CFLAGS +=  -DLAPACK_ILP64
 | 
			
		||||
endif
 | 
			
		||||
ifdef OS_WINDOWS
 | 
			
		||||
LAPACK_CFLAGS +=  -DOPENBLAS_OS_WINDOWS
 | 
			
		||||
endif
 | 
			
		||||
ifeq ($(C_COMPILER), LSB)
 | 
			
		||||
LAPACK_CFLAGS +=  -DLAPACK_COMPLEX_STRUCTURE
 | 
			
		||||
endif
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -606,7 +606,8 @@ clean ::
 | 
			
		|||
	@if test -d $(ARCH); then \
 | 
			
		||||
	(cd $(ARCH) && $(MAKE) clean) \
 | 
			
		||||
	fi
 | 
			
		||||
	@rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \
 | 
			
		||||
	@find . -name '*.o' | xargs rm -rf
 | 
			
		||||
	@rm -rf *.a *.s *.po *.obj *.i *.so core core.* gmon.out *.cso \
 | 
			
		||||
	*.csx *.is *~ *.exe *.flame *.pdb *.dwf \
 | 
			
		||||
	gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \
 | 
			
		||||
	*.pc *.pcl *.def *.i *.prof linktest.c \
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -441,9 +441,10 @@ int BLASFUNC(blas_thread_shutdown)(void){
 | 
			
		|||
  if (blas_server_avail){
 | 
			
		||||
 | 
			
		||||
    SetEvent(pool.killed);
 | 
			
		||||
    
 | 
			
		||||
    printf("blas_num_threads=%d\n", blas_num_threads);
 | 
			
		||||
    for(i = 0; i < blas_num_threads - 1; i++){
 | 
			
		||||
      WaitForSingleObject(blas_threads[i], INFINITE);
 | 
			
		||||
     WaitForSingleObject(blas_threads[i], 5);  //INFINITE);
 | 
			
		||||
	 TerminateThread(blas_threads[i],0);
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    blas_server_avail = 0;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -363,7 +363,7 @@ static void *alloc_mmap(void *address){
 | 
			
		|||
#define BENCH_ITERATION 4
 | 
			
		||||
#define SCALING		2
 | 
			
		||||
 | 
			
		||||
static inline BLASULONG run_bench(BLASULONG address, long size) {
 | 
			
		||||
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
 | 
			
		||||
 | 
			
		||||
  BLASULONG original, *p;
 | 
			
		||||
  BLASULONG start, stop, min;
 | 
			
		||||
| 
						 | 
				
			
			@ -450,12 +450,12 @@ static void *alloc_mmap(void *address){
 | 
			
		|||
	current = (SCALING - 1) * BUFFER_SIZE;
 | 
			
		||||
	
 | 
			
		||||
	while(current > 0) {
 | 
			
		||||
	  *(long *)start = (long)start + PAGESIZE;
 | 
			
		||||
	  *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
 | 
			
		||||
	  start += PAGESIZE;
 | 
			
		||||
	  current -= PAGESIZE;
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	*(long *)(start - PAGESIZE) = (BLASULONG)map_address;
 | 
			
		||||
	*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
 | 
			
		||||
	
 | 
			
		||||
	start = (BLASULONG)map_address;
 | 
			
		||||
	
 | 
			
		||||
| 
						 | 
				
			
			@ -1170,7 +1170,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
 | 
			
		|||
 | 
			
		||||
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
 | 
			
		||||
 | 
			
		||||
  long size;
 | 
			
		||||
  size_t size;
 | 
			
		||||
  BLASULONG buffer;
 | 
			
		||||
 | 
			
		||||
  size   = BUFFER_SIZE - PAGESIZE;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -111,7 +111,7 @@ libgoto_hpl.def : gensymbol
 | 
			
		|||
	perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)
 | 
			
		||||
 | 
			
		||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
 | 
			
		||||
	$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB)
 | 
			
		||||
	$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB)
 | 
			
		||||
 | 
			
		||||
symbol.$(SUFFIX) : symbol.S
 | 
			
		||||
	$(CC) $(CFLAGS) -c -o $(@F) $^
 | 
			
		||||
| 
						 | 
				
			
			@ -124,14 +124,17 @@ ifeq ($(OSNAME), Linux)
 | 
			
		|||
so : ../$(LIBSONAME)
 | 
			
		||||
 | 
			
		||||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
 | 
			
		||||
ifneq ($(C_COMPILER), LSB)
 | 
			
		||||
	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
 | 
			
		||||
	-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
 | 
			
		||||
	-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
 | 
			
		||||
ifneq ($(C_COMPILER), LSB)
 | 
			
		||||
	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
 | 
			
		||||
else
 | 
			
		||||
#Use FC on LSB
 | 
			
		||||
	$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
 | 
			
		||||
#for LSB
 | 
			
		||||
	env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
 | 
			
		||||
	-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
 | 
			
		||||
	-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
 | 
			
		||||
	$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
 | 
			
		||||
endif
 | 
			
		||||
	rm -f linktest
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -60,7 +60,6 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *
 | 
			
		|||
};
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
extern void BLASFUNC(dtrtrilapack)(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info);
 | 
			
		||||
 | 
			
		||||
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -133,18 +132,6 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
 | 
			
		|||
  if (args.nthreads == 1) {
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if DOUBLE
 | 
			
		||||
    // double trtri_U single thread error
 | 
			
		||||
    // call dtrtri from lapack for a walk around.
 | 
			
		||||
    if(uplo==0){
 | 
			
		||||
      BLASFUNC(dtrtrilapack)(UPLO, DIAG, N, a, ldA, Info);
 | 
			
		||||
#ifndef PPC440
 | 
			
		||||
      blas_memory_free(buffer);
 | 
			
		||||
#endif
 | 
			
		||||
      return 0;
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
 | 
			
		||||
    
 | 
			
		||||
#ifdef SMP
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -103,7 +103,7 @@
 | 
			
		|||
	vmovups		-10*SIZE(AO,%rax,8), %xmm6
 | 
			
		||||
	vfmaddpd	%xmm14, %xmm6 , %xmm1 , %xmm14
 | 
			
		||||
	vfmaddpd	%xmm15, %xmm6 , %xmm2 , %xmm15
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_8x2
 | 
			
		||||
| 
						 | 
				
			
			@ -265,7 +265,7 @@
 | 
			
		|||
	vmovups		-14*SIZE(AO,%rax,4), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm10, %xmm0 , %xmm1 , %xmm10
 | 
			
		||||
	vfmaddpd	%xmm11, %xmm0 , %xmm2 , %xmm11
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -338,7 +338,7 @@
 | 
			
		|||
	vmovups		-16*SIZE(AO,%rax,2), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	vfmaddpd	%xmm9 , %xmm0 , %xmm2 , %xmm9
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -378,7 +378,7 @@
 | 
			
		|||
	vmovups 	-16*SIZE(BO,%rax,2), %xmm1
 | 
			
		||||
	vmovddup	-16*SIZE(AO,%rax,1), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_1x2
 | 
			
		||||
| 
						 | 
				
			
			@ -411,7 +411,7 @@
 | 
			
		|||
	vfmaddpd	%xmm10, %xmm0 , %xmm1 , %xmm10
 | 
			
		||||
	vmovups		-10*SIZE(AO,%rax,8), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm11, %xmm0 , %xmm1 , %xmm11
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_8x1
 | 
			
		||||
| 
						 | 
				
			
			@ -510,7 +510,7 @@
 | 
			
		|||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	vmovups		-14*SIZE(AO,%rax,4), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm9 , %xmm0 , %xmm1 , %xmm9
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -560,7 +560,7 @@
 | 
			
		|||
	vmovddup	-16*SIZE(BO,%rax,1), %xmm1
 | 
			
		||||
	vmovups		-16*SIZE(AO,%rax,2), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -592,7 +592,7 @@
 | 
			
		|||
	vmovsd  	-16*SIZE(BO,%rax,1), %xmm1
 | 
			
		||||
	vmovsd 		-16*SIZE(AO,%rax,1), %xmm0
 | 
			
		||||
	vfmaddsd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_1x1
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -103,7 +103,7 @@
 | 
			
		|||
	vmovups		-10*SIZE(AO,%rax,8), %xmm6
 | 
			
		||||
	vfmaddpd	%xmm14, %xmm6 , %xmm1 , %xmm14
 | 
			
		||||
	vfmaddpd	%xmm15, %xmm6 , %xmm2 , %xmm15
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_8x2
 | 
			
		||||
| 
						 | 
				
			
			@ -177,7 +177,7 @@
 | 
			
		|||
	vmovups		-14*SIZE(AO,%rax,4), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm10, %xmm0 , %xmm1 , %xmm10
 | 
			
		||||
	vfmaddpd	%xmm11, %xmm0 , %xmm2 , %xmm11
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -226,7 +226,7 @@
 | 
			
		|||
	vmovups		-16*SIZE(AO,%rax,2), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	vfmaddpd	%xmm9 , %xmm0 , %xmm2 , %xmm9
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -262,7 +262,7 @@
 | 
			
		|||
	vmovups 	-16*SIZE(BO,%rax,2), %xmm1
 | 
			
		||||
	vmovddup	-16*SIZE(AO,%rax,1), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_1x2
 | 
			
		||||
| 
						 | 
				
			
			@ -306,7 +306,7 @@
 | 
			
		|||
	vfmaddpd	%xmm10, %xmm0 , %xmm1 , %xmm10
 | 
			
		||||
	vmovups		-10*SIZE(AO,%rax,8), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm11, %xmm0 , %xmm1 , %xmm11
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_8x1
 | 
			
		||||
| 
						 | 
				
			
			@ -347,7 +347,7 @@
 | 
			
		|||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	vmovups		-14*SIZE(AO,%rax,4), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm9 , %xmm0 , %xmm1 , %xmm9
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -377,7 +377,7 @@
 | 
			
		|||
	vmovddup	-16*SIZE(BO,%rax,1), %xmm1
 | 
			
		||||
	vmovups		-16*SIZE(AO,%rax,2), %xmm0
 | 
			
		||||
	vfmaddpd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -402,7 +402,7 @@
 | 
			
		|||
	vmovsd  	-16*SIZE(BO,%rax,1), %xmm1
 | 
			
		||||
	vmovsd 		-16*SIZE(AO,%rax,1), %xmm0
 | 
			
		||||
	vfmaddsd	%xmm8 , %xmm0 , %xmm1 , %xmm8
 | 
			
		||||
	addq    $SIZE, %rax
 | 
			
		||||
	addq    $ SIZE, %rax
 | 
			
		||||
.endm
 | 
			
		||||
 | 
			
		||||
.macro SOLVE_1x1
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -45,7 +45,11 @@ extern "C" {
 | 
			
		|||
 | 
			
		||||
#ifndef lapack_int
 | 
			
		||||
#if defined(LAPACK_ILP64)
 | 
			
		||||
#if defined(OPENBLAS_OS_WINDOWS)
 | 
			
		||||
#define lapack_int              long long
 | 
			
		||||
#else
 | 
			
		||||
#define lapack_int              long
 | 
			
		||||
#endif
 | 
			
		||||
#else
 | 
			
		||||
#define lapack_int              int
 | 
			
		||||
#endif
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -67,14 +67,14 @@ double sqrt(double);
 | 
			
		|||
#undef  GETRF_FACTOR
 | 
			
		||||
#define GETRF_FACTOR 1.00
 | 
			
		||||
 | 
			
		||||
static inline long FORMULA1(long M, long N, long IS, long BK, long T) {
 | 
			
		||||
static inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
 | 
			
		||||
 | 
			
		||||
  double m = (double)(M - IS - BK);
 | 
			
		||||
  double n = (double)(N - IS - BK);
 | 
			
		||||
  double b = (double)BK;
 | 
			
		||||
  double a = (double)T;
 | 
			
		||||
 | 
			
		||||
  return (long)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a);
 | 
			
		||||
  return (BLASLONG)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a);
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -111,7 +111,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
 | 
			
		|||
 | 
			
		||||
  if (args -> a == NULL) {
 | 
			
		||||
    TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
 | 
			
		||||
    sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
    sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  } else {
 | 
			
		||||
    sb  = (FLOAT *)args -> a;
 | 
			
		||||
  }
 | 
			
		||||
| 
						 | 
				
			
			@ -221,7 +221,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
 | 
			
		|||
 | 
			
		||||
  if (args -> a == NULL) {
 | 
			
		||||
    TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
 | 
			
		||||
    sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
    sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  } else {
 | 
			
		||||
    sb  = (FLOAT *)args -> a;
 | 
			
		||||
  }
 | 
			
		||||
| 
						 | 
				
			
			@ -448,7 +448,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
			
		|||
 | 
			
		||||
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
 | 
			
		||||
 | 
			
		||||
  sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  
 | 
			
		||||
  is = 0;
 | 
			
		||||
  num_cpu = 0;
 | 
			
		||||
| 
						 | 
				
			
			@ -685,7 +685,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
			
		|||
  if (width > n - init_bk) width = n - init_bk;
 | 
			
		||||
 | 
			
		||||
  if (width < init_bk) {
 | 
			
		||||
    long temp;
 | 
			
		||||
    BLASLONG temp;
 | 
			
		||||
 | 
			
		||||
    temp = FORMULA2(m, n, 0, init_bk, args -> nthreads);
 | 
			
		||||
    temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
 | 
			
		||||
| 
						 | 
				
			
			@ -708,7 +708,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
			
		|||
  is = 0;
 | 
			
		||||
  num_cpu = 0;
 | 
			
		||||
 | 
			
		||||
  sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  sbb = (FLOAT *)((((BLASULONG)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
 | 
			
		||||
  while (is < mn) {
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -178,7 +178,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
			
		|||
    return info;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
 | 
			
		||||
  info = 0;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -82,7 +82,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
			
		|||
    return info;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
 | 
			
		||||
  info = 0;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -185,7 +185,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 | 
			
		|||
 | 
			
		||||
  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
 | 
			
		||||
 | 
			
		||||
  buffer[0] = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
 | 
			
		||||
  for (i = 1; i < DIVIDE_RATE; i++) {
 | 
			
		||||
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
 | 
			
		||||
  }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -13,7 +13,6 @@ ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_sing
 | 
			
		|||
 | 
			
		||||
XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX)
 | 
			
		||||
 | 
			
		||||
DBLASOBJS += dtrtri_lapack.$(SUFFIX)
 | 
			
		||||
 | 
			
		||||
ifdef SMP
 | 
			
		||||
SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX)
 | 
			
		||||
| 
						 | 
				
			
			@ -54,9 +53,6 @@ dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c
 | 
			
		|||
dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c
 | 
			
		||||
	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F)
 | 
			
		||||
 | 
			
		||||
dtrtri_lapack.$(SUFFIX) : dtrtri_lapack.f
 | 
			
		||||
	$(FC) -c $(FFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F)
 | 
			
		||||
 | 
			
		||||
dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c
 | 
			
		||||
	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,242 +0,0 @@
 | 
			
		|||
*> \brief \b DTRTRI
 | 
			
		||||
*
 | 
			
		||||
*  =========== DOCUMENTATION ===========
 | 
			
		||||
*
 | 
			
		||||
* Online html documentation available at 
 | 
			
		||||
*            http://www.netlib.org/lapack/explore-html/ 
 | 
			
		||||
*
 | 
			
		||||
*> \htmlonly
 | 
			
		||||
*> Download DTRTRI + dependencies 
 | 
			
		||||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dtrtri.f"> 
 | 
			
		||||
*> [TGZ]</a> 
 | 
			
		||||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dtrtri.f"> 
 | 
			
		||||
*> [ZIP]</a> 
 | 
			
		||||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dtrtri.f"> 
 | 
			
		||||
*> [TXT]</a>
 | 
			
		||||
*> \endhtmlonly 
 | 
			
		||||
*
 | 
			
		||||
*  Definition:
 | 
			
		||||
*  ===========
 | 
			
		||||
*
 | 
			
		||||
*       SUBROUTINE DTRTRI( UPLO, DIAG, N, A, LDA, INFO )
 | 
			
		||||
* 
 | 
			
		||||
*       .. Scalar Arguments ..
 | 
			
		||||
*       CHARACTER          DIAG, UPLO
 | 
			
		||||
*       INTEGER            INFO, LDA, N
 | 
			
		||||
*       ..
 | 
			
		||||
*       .. Array Arguments ..
 | 
			
		||||
*       DOUBLE PRECISION   A( LDA, * )
 | 
			
		||||
*       ..
 | 
			
		||||
*  
 | 
			
		||||
*
 | 
			
		||||
*> \par Purpose:
 | 
			
		||||
*  =============
 | 
			
		||||
*>
 | 
			
		||||
*> \verbatim
 | 
			
		||||
*>
 | 
			
		||||
*> DTRTRI computes the inverse of a real upper or lower triangular
 | 
			
		||||
*> matrix A.
 | 
			
		||||
*>
 | 
			
		||||
*> This is the Level 3 BLAS version of the algorithm.
 | 
			
		||||
*> \endverbatim
 | 
			
		||||
*
 | 
			
		||||
*  Arguments:
 | 
			
		||||
*  ==========
 | 
			
		||||
*
 | 
			
		||||
*> \param[in] UPLO
 | 
			
		||||
*> \verbatim
 | 
			
		||||
*>          UPLO is CHARACTER*1
 | 
			
		||||
*>          = 'U':  A is upper triangular;
 | 
			
		||||
*>          = 'L':  A is lower triangular.
 | 
			
		||||
*> \endverbatim
 | 
			
		||||
*>
 | 
			
		||||
*> \param[in] DIAG
 | 
			
		||||
*> \verbatim
 | 
			
		||||
*>          DIAG is CHARACTER*1
 | 
			
		||||
*>          = 'N':  A is non-unit triangular;
 | 
			
		||||
*>          = 'U':  A is unit triangular.
 | 
			
		||||
*> \endverbatim
 | 
			
		||||
*>
 | 
			
		||||
*> \param[in] N
 | 
			
		||||
*> \verbatim
 | 
			
		||||
*>          N is INTEGER
 | 
			
		||||
*>          The order of the matrix A.  N >= 0.
 | 
			
		||||
*> \endverbatim
 | 
			
		||||
*>
 | 
			
		||||
*> \param[in,out] A
 | 
			
		||||
*> \verbatim
 | 
			
		||||
*>          A is DOUBLE PRECISION array, dimension (LDA,N)
 | 
			
		||||
*>          On entry, the triangular matrix A.  If UPLO = 'U', the
 | 
			
		||||
*>          leading N-by-N upper triangular part of the array A contains
 | 
			
		||||
*>          the upper triangular matrix, and the strictly lower
 | 
			
		||||
*>          triangular part of A is not referenced.  If UPLO = 'L', the
 | 
			
		||||
*>          leading N-by-N lower triangular part of the array A contains
 | 
			
		||||
*>          the lower triangular matrix, and the strictly upper
 | 
			
		||||
*>          triangular part of A is not referenced.  If DIAG = 'U', the
 | 
			
		||||
*>          diagonal elements of A are also not referenced and are
 | 
			
		||||
*>          assumed to be 1.
 | 
			
		||||
*>          On exit, the (triangular) inverse of the original matrix, in
 | 
			
		||||
*>          the same storage format.
 | 
			
		||||
*> \endverbatim
 | 
			
		||||
*>
 | 
			
		||||
*> \param[in] LDA
 | 
			
		||||
*> \verbatim
 | 
			
		||||
*>          LDA is INTEGER
 | 
			
		||||
*>          The leading dimension of the array A.  LDA >= max(1,N).
 | 
			
		||||
*> \endverbatim
 | 
			
		||||
*>
 | 
			
		||||
*> \param[out] INFO
 | 
			
		||||
*> \verbatim
 | 
			
		||||
*>          INFO is INTEGER
 | 
			
		||||
*>          = 0: successful exit
 | 
			
		||||
*>          < 0: if INFO = -i, the i-th argument had an illegal value
 | 
			
		||||
*>          > 0: if INFO = i, A(i,i) is exactly zero.  The triangular
 | 
			
		||||
*>               matrix is singular and its inverse can not be computed.
 | 
			
		||||
*> \endverbatim
 | 
			
		||||
*
 | 
			
		||||
*  Authors:
 | 
			
		||||
*  ========
 | 
			
		||||
*
 | 
			
		||||
*> \author Univ. of Tennessee 
 | 
			
		||||
*> \author Univ. of California Berkeley 
 | 
			
		||||
*> \author Univ. of Colorado Denver 
 | 
			
		||||
*> \author NAG Ltd. 
 | 
			
		||||
*
 | 
			
		||||
*> \date November 2011
 | 
			
		||||
*
 | 
			
		||||
*> \ingroup doubleOTHERcomputational
 | 
			
		||||
*
 | 
			
		||||
*  =====================================================================
 | 
			
		||||
      SUBROUTINE DTRTRILAPACK( UPLO, DIAG, N, A, LDA, INFO )
 | 
			
		||||
*
 | 
			
		||||
*  -- LAPACK computational routine (version 3.4.0) --
 | 
			
		||||
*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 | 
			
		||||
*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 | 
			
		||||
*     November 2011
 | 
			
		||||
*
 | 
			
		||||
*     .. Scalar Arguments ..
 | 
			
		||||
      CHARACTER          DIAG, UPLO
 | 
			
		||||
      INTEGER            INFO, LDA, N
 | 
			
		||||
*     ..
 | 
			
		||||
*     .. Array Arguments ..
 | 
			
		||||
      DOUBLE PRECISION   A( LDA, * )
 | 
			
		||||
*     ..
 | 
			
		||||
*
 | 
			
		||||
*  =====================================================================
 | 
			
		||||
*
 | 
			
		||||
*     .. Parameters ..
 | 
			
		||||
      DOUBLE PRECISION   ONE, ZERO
 | 
			
		||||
      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
 | 
			
		||||
*     ..
 | 
			
		||||
*     .. Local Scalars ..
 | 
			
		||||
      LOGICAL            NOUNIT, UPPER
 | 
			
		||||
      INTEGER            J, JB, NB, NN
 | 
			
		||||
*     ..
 | 
			
		||||
*     .. External Functions ..
 | 
			
		||||
      LOGICAL            LSAME
 | 
			
		||||
      INTEGER            ILAENV
 | 
			
		||||
      EXTERNAL           LSAME, ILAENV
 | 
			
		||||
*     ..
 | 
			
		||||
*     .. External Subroutines ..
 | 
			
		||||
      EXTERNAL           DTRMM, DTRSM, DTRTI2, XERBLA
 | 
			
		||||
*     ..
 | 
			
		||||
*     .. Intrinsic Functions ..
 | 
			
		||||
      INTRINSIC          MAX, MIN
 | 
			
		||||
*     ..
 | 
			
		||||
*     .. Executable Statements ..
 | 
			
		||||
*
 | 
			
		||||
*     Test the input parameters.
 | 
			
		||||
*
 | 
			
		||||
      INFO = 0
 | 
			
		||||
      UPPER = LSAME( UPLO, 'U' )
 | 
			
		||||
      NOUNIT = LSAME( DIAG, 'N' )
 | 
			
		||||
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
 | 
			
		||||
         INFO = -1
 | 
			
		||||
      ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
 | 
			
		||||
         INFO = -2
 | 
			
		||||
      ELSE IF( N.LT.0 ) THEN
 | 
			
		||||
         INFO = -3
 | 
			
		||||
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
 | 
			
		||||
         INFO = -5
 | 
			
		||||
      END IF
 | 
			
		||||
      IF( INFO.NE.0 ) THEN
 | 
			
		||||
         CALL XERBLA( 'DTRTRI', -INFO )
 | 
			
		||||
         RETURN
 | 
			
		||||
      END IF
 | 
			
		||||
*
 | 
			
		||||
*     Quick return if possible
 | 
			
		||||
*
 | 
			
		||||
      IF( N.EQ.0 )
 | 
			
		||||
     $   RETURN
 | 
			
		||||
*
 | 
			
		||||
*     Check for singularity if non-unit.
 | 
			
		||||
*
 | 
			
		||||
      IF( NOUNIT ) THEN
 | 
			
		||||
         DO 10 INFO = 1, N
 | 
			
		||||
            IF( A( INFO, INFO ).EQ.ZERO )
 | 
			
		||||
     $         RETURN
 | 
			
		||||
   10    CONTINUE
 | 
			
		||||
         INFO = 0
 | 
			
		||||
      END IF
 | 
			
		||||
*
 | 
			
		||||
*     Determine the block size for this environment.
 | 
			
		||||
*
 | 
			
		||||
      NB = ILAENV( 1, 'DTRTRI', UPLO // DIAG, N, -1, -1, -1 )
 | 
			
		||||
      IF( NB.LE.1 .OR. NB.GE.N ) THEN
 | 
			
		||||
*
 | 
			
		||||
*        Use unblocked code
 | 
			
		||||
*
 | 
			
		||||
         CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO )
 | 
			
		||||
      ELSE
 | 
			
		||||
*
 | 
			
		||||
*        Use blocked code
 | 
			
		||||
*
 | 
			
		||||
         IF( UPPER ) THEN
 | 
			
		||||
*
 | 
			
		||||
*           Compute inverse of upper triangular matrix
 | 
			
		||||
*
 | 
			
		||||
            DO 20 J = 1, N, NB
 | 
			
		||||
               JB = MIN( NB, N-J+1 )
 | 
			
		||||
*
 | 
			
		||||
*              Compute rows 1:j-1 of current block column
 | 
			
		||||
*
 | 
			
		||||
               CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1,
 | 
			
		||||
     $                     JB, ONE, A, LDA, A( 1, J ), LDA )
 | 
			
		||||
               CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1,
 | 
			
		||||
     $                     JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA )
 | 
			
		||||
*
 | 
			
		||||
*              Compute inverse of current diagonal block
 | 
			
		||||
*
 | 
			
		||||
               CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO )
 | 
			
		||||
   20       CONTINUE
 | 
			
		||||
         ELSE
 | 
			
		||||
*
 | 
			
		||||
*           Compute inverse of lower triangular matrix
 | 
			
		||||
*
 | 
			
		||||
            NN = ( ( N-1 ) / NB )*NB + 1
 | 
			
		||||
            DO 30 J = NN, 1, -NB
 | 
			
		||||
               JB = MIN( NB, N-J+1 )
 | 
			
		||||
               IF( J+JB.LE.N ) THEN
 | 
			
		||||
*
 | 
			
		||||
*                 Compute rows j+jb:n of current block column
 | 
			
		||||
*
 | 
			
		||||
                  CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG,
 | 
			
		||||
     $                        N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA,
 | 
			
		||||
     $                        A( J+JB, J ), LDA )
 | 
			
		||||
                  CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG,
 | 
			
		||||
     $                        N-J-JB+1, JB, -ONE, A( J, J ), LDA,
 | 
			
		||||
     $                        A( J+JB, J ), LDA )
 | 
			
		||||
               END IF
 | 
			
		||||
*
 | 
			
		||||
*              Compute inverse of current diagonal block
 | 
			
		||||
*
 | 
			
		||||
               CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO )
 | 
			
		||||
   30       CONTINUE
 | 
			
		||||
         END IF
 | 
			
		||||
      END IF
 | 
			
		||||
*
 | 
			
		||||
      RETURN
 | 
			
		||||
*
 | 
			
		||||
*     End of DTRTRI
 | 
			
		||||
*
 | 
			
		||||
      END
 | 
			
		||||
| 
						 | 
				
			
			@ -127,7 +127,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
			
		|||
	    if (min_i > GEMM_P) min_i = GEMM_P;
 | 
			
		||||
	    
 | 
			
		||||
	  if (ls == i + bk) {
 | 
			
		||||
	    NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
 | 
			
		||||
	    //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
 | 
			
		||||
 | 
			
		||||
	    GEMM_BETA(min_i, bk, 0, dm1, 
 | 
			
		||||
#ifdef COMPLEX
 | 
			
		||||
		      ZERO, 
 | 
			
		||||
#endif
 | 
			
		||||
		      NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda);
 | 
			
		||||
 | 
			
		||||
	    TRSM_KERNEL_RN(min_i, bk, bk, dm1, 
 | 
			
		||||
#ifdef COMPLEX
 | 
			
		||||
| 
						 | 
				
			
			@ -171,7 +177,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
			
		|||
	  min_i = i - is;
 | 
			
		||||
	  if (min_i > GEMM_P) min_i = GEMM_P;
 | 
			
		||||
	  
 | 
			
		||||
	  NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
 | 
			
		||||
	  //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
 | 
			
		||||
	  GEMM_BETA(min_i, bk, 0, dm1, 
 | 
			
		||||
#ifdef COMPLEX
 | 
			
		||||
		    ZERO, 
 | 
			
		||||
#endif
 | 
			
		||||
		    NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda);
 | 
			
		||||
 | 
			
		||||
	  TRSM_KERNEL_RN(min_i, bk, bk, dm1, 
 | 
			
		||||
#ifdef COMPLEX
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,8 +1,8 @@
 | 
			
		|||
/*This is only for "make install" target.*/
 | 
			
		||||
 | 
			
		||||
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
 | 
			
		||||
#define WINDOWS_ABI
 | 
			
		||||
#define OS_WINDOWS
 | 
			
		||||
#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX)
 | 
			
		||||
#define OPENBLAS_WINDOWS_ABI
 | 
			
		||||
#define OPENBLAS_OS_WINDOWS
 | 
			
		||||
 | 
			
		||||
#ifdef DOUBLE
 | 
			
		||||
#define DOUBLE_DEFINED DOUBLE
 | 
			
		||||
| 
						 | 
				
			
			@ -10,23 +10,23 @@
 | 
			
		|||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef NEEDBUNDERSCORE
 | 
			
		||||
#ifdef OPENBLAS_NEEDBUNDERSCORE
 | 
			
		||||
#define BLASFUNC(FUNC) FUNC##_
 | 
			
		||||
#else
 | 
			
		||||
#define BLASFUNC(FUNC) FUNC
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef QUAD_PRECISION
 | 
			
		||||
#ifdef OPENBLAS_QUAD_PRECISION
 | 
			
		||||
typedef struct {
 | 
			
		||||
  unsigned long x[2];
 | 
			
		||||
}  xdouble;
 | 
			
		||||
#elif defined EXPRECISION
 | 
			
		||||
#elif defined OPENBLAS_EXPRECISION
 | 
			
		||||
#define xdouble long double
 | 
			
		||||
#else
 | 
			
		||||
#define xdouble double
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(OS_WINDOWS) && defined(__64BIT__)
 | 
			
		||||
#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__)
 | 
			
		||||
typedef long long BLASLONG;
 | 
			
		||||
typedef unsigned long long BLASULONG;
 | 
			
		||||
#else
 | 
			
		||||
| 
						 | 
				
			
			@ -34,7 +34,7 @@ typedef long BLASLONG;
 | 
			
		|||
typedef unsigned long BLASULONG;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef USE64BITINT
 | 
			
		||||
#ifdef OPENBLAS_USE64BITINT
 | 
			
		||||
typedef BLASLONG blasint;
 | 
			
		||||
#else
 | 
			
		||||
typedef int blasint;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue