Merge branch 'develop'

This commit is contained in:
Zhang Xianyi 2012-08-20 16:52:35 +08:00
commit 48f075cfd5
29 changed files with 2854 additions and 734 deletions

View File

@ -1,4 +1,17 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.2.3
20-Aug-2012
common:
* Fixed a bug that made the LAPACK ?laswp routines unstable. (#130)
* Fixed the shared library bug about unloading the library on
Linux (#132).
* Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2).
Please use gcc and IBM xlf. (#134)
x86/x86-64:
* Added support for the goto_set_num_threads and openblas_set_num_threads
APIs on Windows. They set the number of threads at runtime.
==================================================================== ====================================================================
Version 0.2.2 Version 0.2.2
6-July-2012 6-July-2012

View File

@ -3,7 +3,7 @@ include ./Makefile.system
BLASDIRS = interface driver/level2 driver/level3 driver/others BLASDIRS = interface driver/level2 driver/level3 driver/others
ifndef DYNAMIC_ARCH ifneq ($(DYNAMIC_ARCH), 1)
BLASDIRS += kernel BLASDIRS += kernel
endif endif
@ -99,11 +99,9 @@ ifeq ($(OSNAME), Darwin)
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll $(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll $(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
endif endif
tests : tests :
@ -147,7 +145,7 @@ ifeq ($(EXPRECISION), 1)
echo "#define EXPRECISION">> config_last.h echo "#define EXPRECISION">> config_last.h
endif endif
## ##
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1 $(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \ for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
@ -165,7 +163,7 @@ prof_blas :
$(MAKE) -C $$d prof || exit 1 ; \ $(MAKE) -C $$d prof || exit 1 ; \
fi; \ fi; \
done done
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonprof || exit 1 $(MAKE) -C kernel commonprof || exit 1
endif endif
@ -184,7 +182,7 @@ hpl :
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \ fi; \
done done
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1 $(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \ for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
@ -233,7 +231,7 @@ ifndef NOFORTRAN
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif endif

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.2.2 VERSION = 0.2.3
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -108,19 +108,16 @@ VERSION = 0.2.2
# The installation directory. # The installation directory.
# PREFIX = /opt/OpenBLAS # PREFIX = /opt/OpenBLAS
# Common Optimization Flag; -O2 is enough. # Common Optimization Flag;
# DEBUG = 1 # The default -O2 is enough.
# COMMON_OPT = -O2
ifeq ($(DEBUG), 1)
COMMON_OPT += -g
# -DDEBUG
else
COMMON_OPT += -O2
endif
# Profiling flags # Profiling flags
COMMON_PROF = -pg COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
# #
# End of user configuration # End of user configuration
# #

View File

@ -244,7 +244,7 @@ endif
endif endif
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
@ -687,6 +687,15 @@ AWK = awk
REVISION = -r$(VERSION) REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
ifeq ($(DEBUG), 1)
COMMON_OPT += -g
endif
ifndef COMMON_OPT
COMMON_OPT = -O2
endif
CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
@ -705,7 +714,7 @@ ifndef LIBSUFFIX
LIBSUFFIX = a LIBSUFFIX = a
endif endif
ifndef DYNAMIC_ARCH ifneq ($(DYNAMIC_ARCH), 1)
ifndef SMP ifndef SMP
LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX)
LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX)
@ -724,8 +733,8 @@ endif
endif endif
LIBDLLNAME = $(LIBPREFIX).dll
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll)
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)

View File

@ -100,6 +100,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should less than or equal to 256. * The number of CPUs/Cores should less than or equal to 256.
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [a conflict with R's parallel package](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). To avoid it, you can build the library with NO_AFFINITY=1.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
## Specification of Git Branches ## Specification of Git Branches

View File

@ -45,7 +45,7 @@ extern "C" {
int BLASFUNC(xerbla)(char *, blasint *info, blasint); int BLASFUNC(xerbla)(char *, blasint *info, blasint);
void BLASFUNC(openblas_set_num_threads)(int *); void openblas_set_num_threads_(int *);
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);

View File

@ -14,7 +14,7 @@ endif
# COMMONOBJS += info.$(SUFFIX) # COMMONOBJS += info.$(SUFFIX)
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
COMMONOBJS += dynamic.$(SUFFIX) COMMONOBJS += dynamic.$(SUFFIX)
else else
COMMONOBJS += parameter.$(SUFFIX) COMMONOBJS += parameter.$(SUFFIX)
@ -70,7 +70,7 @@ ifndef BLAS_SERVER
BLAS_SERVER = blas_server.c BLAS_SERVER = blas_server.c
endif endif
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
else else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)

View File

@ -435,7 +435,7 @@ static int blas_thread_server(void *arg){
blas_memory_free(buffer); blas_memory_free(buffer);
pthread_exit(NULL); //pthread_exit(NULL);
return 0; return 0;
} }

View File

@ -63,13 +63,7 @@ static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER]; static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER];
void goto_set_num_threads(int num)
{
}
void openblas_set_num_threads(int num)
{
}
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
@ -187,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
do { do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
if (action == WAIT_OBJECT_0 + 1) break; if (action == WAIT_OBJECT_0 + 1) break;
@ -271,7 +265,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
} else { } else {
legacy_exec(routine, queue -> mode, queue -> args, sb); legacy_exec(routine, queue -> mode, queue -> args, sb);
} }
} }else{
continue; //if queue == NULL
}
#ifdef SMP_DEBUG #ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
@ -433,7 +429,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
/* Shutdown procedure, but user don't have to call this routine. The */ /* Shutdown procedure, but user don't have to call this routine. The */
/* kernel automatically kill threads. */ /* kernel automatically kill threads. */
int blas_thread_shutdown_(void){ int BLASFUNC(blas_thread_shutdown)(void){
int i; int i;
@ -445,7 +441,7 @@ int blas_thread_shutdown_(void){
SetEvent(pool.killed); SetEvent(pool.killed);
for(i = 0; i < blas_cpu_number - 1; i++){ for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], INFINITE); WaitForSingleObject(blas_threads[i], INFINITE);
} }
@ -456,3 +452,47 @@ int blas_thread_shutdown_(void){
return 0; return 0;
} }
/*
 * Set the number of threads used by OpenBLAS (Windows thread server).
 *
 * num_threads: requested thread count.
 *   - values below 1 fall back to the current blas_cpu_number;
 *   - values above MAX_CPU_NUMBER are clamped to MAX_CPU_NUMBER.
 *
 * If the request exceeds blas_num_threads (the number of threads already
 * accounted for in the pool), the pool is initialised on first use and the
 * missing worker threads are created while holding server_lock.  A smaller
 * request only lowers blas_cpu_number; no worker thread is terminated here.
 */
void goto_set_num_threads(int num_threads)
{
  long i;

  if (num_threads < 1) num_threads = blas_cpu_number;

  if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

  if (num_threads > blas_num_threads) {

    LOCK_COMMAND(&server_lock);

    /* First use: set up the shared pool state before any worker starts. */
    if (!blas_server_avail){

      InitializeCriticalSection(&pool.lock);
      pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
      pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);

      pool.shutdown = 0;
      pool.queue    = NULL;
      blas_server_avail = 1;
    }

    /*
     * A pool of N threads keeps its N - 1 workers in slots 0 .. N - 2, so
     * growing from blas_num_threads to num_threads fills the slots from
     * blas_num_threads - 1 up to num_threads - 2.  When no thread has been
     * counted yet (blas_num_threads == 0) that start index would be -1 and
     * write before the beginning of blas_threads[] / blas_threads_id[];
     * start at slot 0 in that case.
     */
    for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){

      blas_threads[i] = CreateThread(NULL, 0,
				     blas_thread_server, (void *)i,
				     0, &blas_threads_id[i]);
    }

    blas_num_threads = num_threads;

    UNLOCK_COMMAND(&server_lock);
  }

  blas_cpu_number  = num_threads;
}
/*
 * OpenBLAS-named alias for goto_set_num_threads(): forwards the requested
 * thread count unchanged.
 */
void openblas_set_num_threads(int num) {
  goto_set_num_threads(num);
}

View File

@ -60,6 +60,8 @@ extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BOBCAT;
#define VENDOR_INTEL 1 #define VENDOR_INTEL 1
#define VENDOR_AMD 2 #define VENDOR_AMD 2
@ -122,15 +124,24 @@ static gotoblas_t *get_coretype(void){
if (model == 12) return &gotoblas_ATOM; if (model == 12) return &gotoblas_ATOM;
return NULL; return NULL;
case 2: case 2:
//Intel Core (Clarkdale) / Core (Arrandale) //Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale) // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm // Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM; if (model == 5) return &gotoblas_NEHALEM;
//Intel Xeon Processor 5600 (Westmere-EP) //Intel Xeon Processor 5600 (Westmere-EP)
if (model == 12) return &gotoblas_NEHALEM; //Xeon Processor E7 (Westmere-EX)
return NULL; if (model == 12 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5
if (model == 10 || model == 13) return &gotoblas_SANDYBRIDGE;
return NULL;
case 3:
//Intel Sandy Bridge 22nm (Ivy Bridge?)
if (model == 10) return &gotoblas_SANDYBRIDGE;
return NULL;
} }
case 0xf: case 0xf:
if (model <= 0x2) return &gotoblas_NORTHWOOD; if (model <= 0x2) return &gotoblas_NORTHWOOD;
@ -144,7 +155,9 @@ static gotoblas_t *get_coretype(void){
if ((exfamily == 0) || (exfamily == 2)) { if ((exfamily == 0) || (exfamily == 2)) {
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON; else return &gotoblas_OPTERON;
} else { } else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else {
return &gotoblas_BARCELONA; return &gotoblas_BARCELONA;
} }
} }
@ -178,6 +191,8 @@ static char *corename[] = {
"Opteron(SSE3)", "Opteron(SSE3)",
"Barcelona", "Barcelona",
"Nano", "Nano",
"Sandybridge",
"Bobcat",
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@ -197,6 +212,8 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_OPTERON) return corename[13]; if (gotoblas == &gotoblas_OPTERON) return corename[13];
if (gotoblas == &gotoblas_BARCELONA) return corename[14]; if (gotoblas == &gotoblas_BARCELONA) return corename[14];
if (gotoblas == &gotoblas_NANO) return corename[15]; if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
return corename[0]; return corename[0];
} }

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -206,7 +206,15 @@ int get_num_procs(void) {
#endif #endif
/*
The number of CPU cores OpenBLAS uses for multithreading.
It can be set with openblas_set_num_threads(int num_threads).
*/
int blas_cpu_number = 0; int blas_cpu_number = 0;
/*
The number of threads in the thread pool.
This value is greater than or equal to blas_cpu_number, which means some threads may be sleeping.
*/
int blas_num_threads = 0; int blas_num_threads = 0;
int goto_get_num_procs (void) { int goto_get_num_procs (void) {
@ -1289,6 +1297,7 @@ void DESTRUCTOR gotoblas_quit(void) {
moncontrol (1); moncontrol (1);
#endif #endif
blas_shutdown();
} }
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern void openblas_set_num_threads(int num_threads) ; extern void openblas_set_num_threads(int num_threads) ;
void NAME(int* num_threads){ void openblas_set_num_threads_(int* num_threads){
openblas_set_num_threads(*num_threads); openblas_set_num_threads(*num_threads);
} }
@ -46,7 +46,7 @@ void NAME(int* num_threads){
void openblas_set_num_threads(int num_threads) { void openblas_set_num_threads(int num_threads) {
} }
void NAME(int* num_threads){ void openblas_set_num_threads_(int* num_threads){
} }
#endif #endif

View File

@ -66,6 +66,11 @@ dll : ../$(LIBDLLNAME)
dll2 : libgoto2_shared.dll dll2 : libgoto2_shared.dll
# On Windows, we only generate a DLL without a version suffix. This is because
# applications which link against the dynamic library reference a fixed DLL name
# in their import table. By instead using a stable name it is possible to
# upgrade between library versions, without needing to re-link an application.
# For more details see: https://github.com/xianyi/OpenBLAS/issues/127.
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME) $(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1) ifeq ($(BINARY32), 1)

View File

@ -48,7 +48,7 @@ HPLOBJS = \
COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX)
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX)
endif endif

View File

@ -118,7 +118,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
min_jj = js + min_j - jjs; min_jj = js + min_j - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (GEMM_UNROLL_N <= 8) { if (0 && GEMM_UNROLL_N <= 8) {
LASWP_NCOPY(min_jj, off + 1, off + k, LASWP_NCOPY(min_jj, off + 1, off + k,
c + (- off + jjs * lda) * COMPSIZE, lda, c + (- off + jjs * lda) * COMPSIZE, lda,
@ -245,7 +245,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (GEMM_UNROLL_N <= 8) { if (0 && GEMM_UNROLL_N <= 8) {
printf("helllo\n");
LASWP_NCOPY(min_jj, off + 1, off + k, LASWP_NCOPY(min_jj, off + 1, off + k,
b + (- off + jjs * lda) * COMPSIZE, lda, b + (- off + jjs * lda) * COMPSIZE, lda,

View File

@ -77,10 +77,21 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_jj = js + min_j - jjs; min_jj = js + min_j - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#if 0
LASWP_NCOPY(min_jj, off + 1, off + k, LASWP_NCOPY(min_jj, off + 1, off + k,
c + (- off + jjs * lda) * COMPSIZE, lda, c + (- off + jjs * lda) * COMPSIZE, lda,
ipiv, sb + k * (jjs - js) * COMPSIZE); ipiv, sb + k * (jjs - js) * COMPSIZE);
#else
LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
#ifdef COMPLEX
ZERO,
#endif
c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);
GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE);
#endif
for (is = 0; is < k; is += GEMM_P) { for (is = 0; is < k; is += GEMM_P) {
min_i = k - is; min_i = k - is;
if (min_i > GEMM_P) min_i = GEMM_P; if (min_i > GEMM_P) min_i = GEMM_P;

View File

@ -113,7 +113,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
min_jj = js + jmin - jjs; min_jj = js + jmin - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#if 0 #if 1
LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO,
#ifdef COMPLEX #ifdef COMPLEX
ZERO, ZERO,

View File

@ -48,7 +48,7 @@
int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2; BLASLONG i, j, ip1, ip2, rows;
blasint *piv; blasint *piv;
FLOAT *a1; FLOAT *a1;
FLOAT *b1, *b2; FLOAT *b1, *b2;
@ -58,14 +58,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
k1 --; k1 --;
#ifndef MINUS #ifndef MINUS
ipiv += k1 ipiv += k1;
;
#else #else
ipiv -= (k2 - 1) * incx; ipiv -= (k2 - 1) * incx;
#endif #endif
if (n <= 0) return 0; if (n <= 0) return 0;
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *a1;
B1 = *b1;
*a1 = B1;
*b1 = A1;
a1 += lda;
b1 += lda;
}
return 0;
}
j = n; j = n;
if (j > 0) { if (j > 0) {
do { do {
@ -85,10 +106,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
i = ((k2 - k1) >> 1); i = (rows >> 1);
if (i > 0) { i--;
do { //Main Loop
while (i > 0) {
#ifdef OPTERON #ifdef OPTERON
#ifndef MINUS #ifndef MINUS
asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1));
@ -172,12 +194,69 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a1 -= 2; a1 -= 2;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
#ifndef MINUS
a1 += 2;
#else
a1 -= 2;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
A1 = *a1; A1 = *a1;
B1 = *b1; B1 = *b1;
*a1 = B1; *a1 = B1;

View File

@ -50,7 +50,7 @@
int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2; BLASLONG i, j, ip1, ip2, rows;
blasint *piv; blasint *piv;
FLOAT *a1, *a3; FLOAT *a1, *a3;
FLOAT *b1, *b2, *b3, *b4; FLOAT *b1, *b2, *b3, *b4;
@ -60,8 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
k1 --; k1 --;
#ifndef MINUS #ifndef MINUS
ipiv += k1 ipiv += k1;
;
#else #else
ipiv -= (k2 - 1) * incx; ipiv -= (k2 - 1) * incx;
#endif #endif
@ -69,6 +68,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
if (n <= 0) return 0; if (n <= 0) return 0;
j = (n >> 1); j = (n >> 1);
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *a1;
B1 = *b1;
*a1 = B1;
*b1 = A1;
a1 += lda;
b1 += lda;
}
return 0;
}
if (j > 0) { if (j > 0) {
do { do {
piv = ipiv; piv = ipiv;
@ -92,10 +113,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b3 = b1 + 1 * lda; b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda; b4 = b2 + 1 * lda;
i = ((k2 - k1) >> 1); i = ((rows) >> 1);
if (i > 0) { // Loop pipeline
do { i--;
//Main Loop
while (i > 0) {
#ifdef CORE2 #ifdef CORE2
#ifndef MINUS #ifndef MINUS
asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
@ -202,12 +226,99 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a3 -= 2; a3 -= 2;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
*a3 = A4;
*a4 = B4;
*b4 = A3;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
*a3 = A4;
*a4 = B3;
*b3 = A3;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
*a3 = B3;
*a4 = A3;
*b3 = A4;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
*a3 = B3;
*a4 = B4;
*b3 = A3;
*b4 = A4;
}
}
#ifndef MINUS
a1 += 2;
a3 += 2;
#else
a1 -= 2;
a3 -= 2;
#endif
//Remain
i = ((rows) & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
b3 = b1 + 1 * lda;
A1 = *a1; A1 = *a1;
B1 = *b1; B1 = *b1;
A3 = *a3; A3 = *a3;
@ -240,78 +351,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
i = ((k2 - k1) >> 1); i = ((rows) >> 1);
i --;
if (i > 0) { while (i > 0) {
do { A1 = *a1;
A1 = *a1; A2 = *a2;
A2 = *a2; B1 = *b1;
B1 = *b1; B2 = *b2;
B2 = *b2;
ip1 = *piv; ip1 = *piv;
piv += incx; piv += incx;
ip2 = *piv; ip2 = *piv;
piv += incx; piv += incx;
if (b1 == a1) { if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) { if (b2 == a1) {
*a1 = A2; *a1 = A2;
*a2 = A1; *a2 = B1;
*b1 = A1;
} else } else
if (b2 != a2) { if (b2 == a2) {
*a2 = B2; *a1 = B1;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1; *b1 = A1;
} else } else
if (b2 == a2) { if (b2 == b1) {
*a1 = B1; *a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1; *b1 = A1;
} else *b2 = A2;
if (b2 == b1) { }
*a1 = B1; }
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
#ifndef MINUS #ifndef MINUS
a1 += 2; a1 += 2;
#else #else
a1 -= 2; a1 -= 2;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending (n=1)
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
#ifndef MINUS
a1 += 2;
#else
a1 -= 2;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
A1 = *a1; A1 = *a1;
B1 = *b1; B1 = *b1;
*a1 = B1; *a1 = B1;

View File

@ -54,7 +54,7 @@
int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2; BLASLONG i, j, ip1, ip2, rows;
blasint *piv; blasint *piv;
FLOAT *a1, *a3, *a5, *a7; FLOAT *a1, *a3, *a5, *a7;
FLOAT *b1, *b2, *b3, *b4; FLOAT *b1, *b2, *b3, *b4;
@ -66,14 +66,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
k1 --; k1 --;
#ifndef MINUS #ifndef MINUS
ipiv += k1 ipiv += k1;
;
#else #else
ipiv -= (k2 - 1) * incx; ipiv -= (k2 - 1) * incx;
#endif #endif
if (n <= 0) return 0; if (n <= 0) return 0;
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *a1;
B1 = *b1;
*a1 = B1;
*b1 = A1;
a1 += lda;
b1 += lda;
}
return 0;
}
j = (n >> 2); j = (n >> 2);
if (j > 0) { if (j > 0) {
do { do {
@ -106,8 +127,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
i = ((k2 - k1) >> 1); i = ((k2 - k1) >> 1);
if (i > 0) { i--; //Loop pipeline
do { //Main Loop
while (i > 0) {
A1 = *a1; A1 = *a1;
A2 = *a2; A2 = *a2;
A3 = *a3; A3 = *a3;
@ -259,12 +281,156 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a7 -= 2; a7 -= 2;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
A5 = *a5;
A6 = *a6;
A7 = *a7;
A8 = *a8;
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
B5 = *b5;
B6 = *b6;
B7 = *b7;
B8 = *b8;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
*a5 = A6;
*a6 = A5;
*a7 = A8;
*a8 = A7;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
*a6 = B6;
*b6 = A6;
*a8 = B8;
*b8 = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
*a5 = A6;
*a6 = A5;
*a7 = A8;
*a8 = A7;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
*a3 = A4;
*a4 = B4;
*b4 = A3;
*a5 = A6;
*a6 = B6;
*b6 = A5;
*a7 = A8;
*a8 = B8;
*b8 = A7;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
*a3 = A4;
*a4 = B3;
*b3 = A3;
*a5 = A6;
*a6 = B5;
*b5 = A5;
*a7 = A8;
*a8 = B7;
*b7 = A7;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
*a5 = B5;
*b5 = A5;
*a7 = B7;
*b7 = A7;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
*a3 = B3;
*a4 = A3;
*b3 = A4;
*a5 = B5;
*a6 = A5;
*b5 = A6;
*a7 = B7;
*a8 = A7;
*b7 = A8;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
*a3 = B3;
*a4 = B4;
*b3 = A3;
*b4 = A4;
*a5 = B5;
*a6 = B6;
*b5 = A5;
*b6 = A6;
*a7 = B7;
*a8 = B8;
*b7 = A7;
*b8 = A8;
}
}
#ifndef MINUS
a1 += 2;
a3 += 2;
a5 += 2;
a7 += 2;
#else
a1 -= 2;
a3 -= 2;
a5 -= 2;
a7 -= 2;
#endif
//Remain
i = ((rows) & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
b3 = b1 + 1 * lda;
b5 = b1 + 2 * lda;
b7 = b1 + 3 * lda;
A1 = *a1; A1 = *a1;
B1 = *b1; B1 = *b1;
A3 = *a3; A3 = *a3;
@ -312,10 +478,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b3 = b1 + 1 * lda; b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda; b4 = b2 + 1 * lda;
i = ((k2 - k1) >> 1); i = ((rows) >> 1);
i--;
if (i > 0) { while (i > 0) {
do {
A1 = *a1; A1 = *a1;
A2 = *a2; A2 = *a2;
A3 = *a3; A3 = *a3;
@ -409,12 +575,97 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a3 -= 2; a3 -= 2;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
*a3 = A4;
*a4 = B4;
*b4 = A3;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
*a3 = A4;
*a4 = B3;
*b3 = A3;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
*a3 = B3;
*a4 = A3;
*b3 = A4;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
*a3 = B3;
*a4 = B4;
*b3 = A3;
*b4 = A4;
}
}
#ifndef MINUS
a1 += 2;
a3 += 2;
#else
a1 -= 2;
a3 -= 2;
#endif
i = ((rows) & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
b3 = b1 + 1 * lda;
A1 = *a1; A1 = *a1;
B1 = *b1; B1 = *b1;
A3 = *a3; A3 = *a3;
@ -445,78 +696,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
i = ((k2 - k1) >> 1); i = ((rows) >> 1);
i --;
if (i > 0) { while (i > 0) {
do { A1 = *a1;
A1 = *a1; A2 = *a2;
A2 = *a2; B1 = *b1;
B1 = *b1; B2 = *b2;
B2 = *b2;
ip1 = *piv; ip1 = *piv;
piv += incx; piv += incx;
ip2 = *piv; ip2 = *piv;
piv += incx; piv += incx;
if (b1 == a1) { if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) { if (b2 == a1) {
*a1 = A2; *a1 = A2;
*a2 = A1; *a2 = B1;
*b1 = A1;
} else } else
if (b2 != a2) { if (b2 == a2) {
*a2 = B2; *a1 = B1;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1; *b1 = A1;
} else } else
if (b2 == a2) { if (b2 == b1) {
*a1 = B1; *a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1; *b1 = A1;
} else *b2 = A2;
if (b2 == b1) { }
*a1 = B1; }
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
#ifndef MINUS #ifndef MINUS
a1 += 2; a1 += 2;
#else #else
a1 -= 2; a1 -= 2;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending (n=1)
// Loop ending for the single-column case: process one pair of rows (a1, a2)
// against their pivot rows (b1, b2) without loading further pivot indices.
// All four values are loaded before any store, so each branch can write from
// the saved copies. The stores in each branch amount to exchanging *a1 with
// *b1 and then *a2 with *b2, with every pointer-aliasing case spelled out.
// NOTE(review): a2 is defined outside this view -- presumably the row
// adjacent to a1; confirm.
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
// Case 1: b1 aliases a1, so the first exchange is a no-op.
if (b1 == a1) {
if (b2 == a1) {
// Second pivot is a1: exchange a1 and a2.
*a1 = A2;
*a2 = A1;
} else
// b2 == a2 would be a no-op as well; otherwise exchange a2 and b2.
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
// Case 2: b1 aliases a2, so the first exchange swaps a1 and a2.
if (b1 == a2) {
// If b2 == a1 the second exchange undoes the first: nothing is stored.
if (b2 != a1) {
if (b2 == a2) {
// Second exchange is a no-op; only a1 <-> a2 remains.
*a1 = A2;
*a2 = A1;
} else {
// Three-way rotation: a1 <- a2, a2 <- b2, b2 <- a1.
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
// Case 3: b1 is neither a1 nor a2.
if (b2 == a1) {
// Three-way rotation: a1 <- a2, a2 <- b1, b1 <- a1.
*a1 = A2;
*a2 = B1;
*b1 = A1;
} else
if (b2 == a2) {
// Second exchange is a no-op; only a1 <-> b1 remains.
*a1 = B1;
*b1 = A1;
} else
if (b2 == b1) {
// Both rows share one pivot: a1 <- b1, a2 <- a1, b1 <- a2.
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
// No aliasing at all: two independent exchanges.
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
// Step a1 by the two rows just handled: forward by default, backward when
// MINUS is defined.
#ifndef MINUS
a1 += 2;
#else
a1 -= 2;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
A1 = *a1; A1 = *a1;
B1 = *b1; B1 = *b1;
*a1 = B1; *a1 = B1;

File diff suppressed because it is too large Load Diff

View File

@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
FLOAT *a, BLASLONG lda, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2; BLASLONG i, j, ip1, ip2, rows;
blasint *piv; blasint *piv;
FLOAT *a1; FLOAT *a1;
FLOAT *b1, *b2; FLOAT *b1, *b2;
@ -66,6 +66,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif #endif
if (n <= 0) return 0; if (n <= 0) return 0;
// Number of rows in the requested range; the trivial sizes are handled here,
// up front.
rows = k2-k1;
// Empty (or inverted) range: nothing to swap.
if (rows <=0) return 0;
if (rows == 1) {
// Only one row: exchange it with its pivot row directly.
// The pivot index is scaled by 2, matching the two FLOATs moved per element
// below (presumably real and imaginary parts -- confirm against the caller).
ip1 = *ipiv * 2;
// Starting address of the row being swapped: derived from k1 by default and
// from k2 when MINUS is defined.
#ifndef MINUS
a1 = a + (k1 + 1) * 2;
#else
a1 = a + k2 * 2;
#endif
b1 = a + ip1;
// The row is its own pivot: every exchange would be a no-op.
if(a1 == b1) return 0;
// n iterations: exchange the FLOAT pair at a1 with the pair at b1, then
// advance both pointers by lda.
for(j=0; j<n; j++){
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
a1 += lda;
b1 += lda;
}
return 0;
}
j = n; j = n;
if (j > 0) { if (j > 0) {
@ -87,9 +119,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b2 = a + ip2; b2 = a + ip2;
i = ((k2 - k1) >> 1); i = ((k2 - k1) >> 1);
i --;
if (i > 0) { //Loop pipeline
do { //Main Loop
while (i > 0) {
#ifdef OPTERON #ifdef OPTERON
#ifndef MINUS #ifndef MINUS
asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1));
@ -198,12 +231,98 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4; a1 -= 4;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending
// Loop ending: process one pair of rows (a1, a2) against their pivot rows
// (b1, b2) without loading further pivot indices. Each element is two
// consecutive FLOATs: (A1, A2) at a1, (A3, A4) at a2, (B1, B2) at b1 and
// (B3, B4) at b2.
// All values are loaded before any store, so each branch can write from the
// saved copies. The stores in each branch amount to exchanging the element at
// a1 with the one at b1 and then a2 with b2, with every pointer-aliasing case
// spelled out.
// NOTE(review): a2 is defined outside this view -- presumably the row
// adjacent to a1; confirm.
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
// Case 1: b1 aliases a1, so the first exchange is a no-op.
if (b1 == a1) {
if (b2 == a1) {
// Second pivot is a1: exchange a1 and a2.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else
// b2 == a2 would be a no-op as well; otherwise exchange a2 and b2.
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
} else
// Case 2: b1 aliases a2, so the first exchange swaps a1 and a2.
if (b1 == a2) {
// If b2 == a1 the second exchange undoes the first: nothing is stored.
if (b2 != a1) {
if (b2 == a2) {
// Second exchange is a no-op; only a1 <-> a2 remains.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else {
// Three-way rotation: a1 <- a2, a2 <- b2, b2 <- a1.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
}
}
} else {
// Case 3: b1 is neither a1 nor a2.
if (b2 == a1) {
// Three-way rotation: a1 <- a2, a2 <- b1, b1 <- a1.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == a2) {
// Second exchange is a no-op; only a1 <-> b1 remains.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == b1) {
// Both rows share one pivot: a1 <- b1, a2 <- a1, b1 <- a2.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
} else {
// No aliasing at all: two independent exchanges.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
}
// Step a1 by the two rows just handled (two FLOATs each): forward by
// default, backward when MINUS is defined.
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
B1 = *(b1 + 0); B1 = *(b1 + 0);

View File

@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
FLOAT *a, BLASLONG lda, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2; BLASLONG i, j, ip1, ip2, rows;
blasint *piv; blasint *piv;
FLOAT *a1; FLOAT *a1;
FLOAT *b1, *b2; FLOAT *b1, *b2;
@ -68,6 +68,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
if (n <= 0) return 0; if (n <= 0) return 0;
// Number of rows in the requested range; the trivial sizes are handled here,
// up front.
rows = k2-k1;
// Empty (or inverted) range: nothing to swap.
if (rows <=0) return 0;
if (rows == 1) {
// Only one row: exchange it with its pivot row directly.
// The pivot index is scaled by 2, matching the two FLOATs moved per element
// below (presumably real and imaginary parts -- confirm against the caller).
ip1 = *ipiv * 2;
// Starting address of the row being swapped: derived from k1 by default and
// from k2 when MINUS is defined.
#ifndef MINUS
a1 = a + (k1 + 1) * 2;
#else
a1 = a + k2 * 2;
#endif
b1 = a + ip1;
// The row is its own pivot: every exchange would be a no-op.
if(a1 == b1) return 0;
// n iterations: exchange the FLOAT pair at a1 with the pair at b1, then
// advance both pointers by lda.
for(j=0; j<n; j++){
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
a1 += lda;
b1 += lda;
}
return 0;
}
j = (n >> 1); j = (n >> 1);
if (j > 0) { if (j > 0) {
@ -88,10 +120,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
i = ((k2 - k1) >> 1); i = (rows >> 1);
i--;
if (i > 0) { //Loop pipeline
do { //Main Loop
while (i > 0) {
#ifdef CORE2 #ifdef CORE2
#ifndef MINUS #ifndef MINUS
asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
@ -246,12 +280,149 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4; a1 -= 4;
#endif #endif
i --; i --;
} while (i > 0);
} }
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
i = ((k2 - k1) & 1); A5 = *(a1 + 0 + lda);
A6 = *(a1 + 1 + lda);
A7 = *(a2 + 0 + lda);
A8 = *(a2 + 1 + lda);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
B5 = *(b1 + 0 + lda);
B6 = *(b1 + 1 + lda);
B7 = *(b2 + 0 + lda);
B8 = *(b2 + 1 + lda);
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a2 + 0 + lda) = B7;
*(a2 + 1 + lda) = B8;
*(b2 + 0 + lda) = A7;
*(b2 + 1 + lda) = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = B7;
*(a2 + 1 + lda) = B8;
*(b2 + 0 + lda) = A5;
*(b2 + 1 + lda) = A6;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = B5;
*(a2 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a1 + 0 + lda) = B5;
*(a1 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a1 + 0 + lda) = B5;
*(a1 + 1 + lda) = B6;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
*(b1 + 0 + lda) = A7;
*(b1 + 1 + lda) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a1 + 0 + lda) = B5;
*(a1 + 1 + lda) = B6;
*(a2 + 0 + lda) = B7;
*(a2 + 1 + lda) = B8;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
*(b2 + 0 + lda) = A7;
*(b2 + 1 + lda) = A8;
}
}
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
A3 = *(a1 + 0 + lda); A3 = *(a1 + 0 + lda);
@ -293,10 +464,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
i = ((k2 - k1) >> 1); i = (rows >> 1);
i--;
if (i > 0) { //Loop pipeline
do { //Main Loop
while (i > 0) {
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
A3 = *(a2 + 0); A3 = *(a2 + 0);
@ -384,12 +557,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4; a1 -= 4;
#endif #endif
i --; i --;
} while (i > 0);
} }
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
i = ((k2 - k1) & 1); if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
}
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
B1 = *(b1 + 0); B1 = *(b1 + 0);

View File

@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
FLOAT *a, BLASLONG lda, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2; BLASLONG i, j, ip1, ip2, rows;
blasint *piv; blasint *piv;
FLOAT *a1, *a3, *a5, *a7; FLOAT *a1, *a3, *a5, *a7;
FLOAT *b1, *b2, *b3, *b4; FLOAT *b1, *b2, *b3, *b4;
@ -76,6 +76,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif #endif
if (n <= 0) return 0; if (n <= 0) return 0;
// Number of rows in the requested range; the trivial sizes are handled here,
// up front.
rows = k2-k1;
// Empty (or inverted) range: nothing to swap.
if (rows <=0) return 0;
if (rows == 1) {
// Only one row: exchange it with its pivot row directly.
// The pivot index is scaled by 2, matching the two FLOATs moved per element
// below (presumably real and imaginary parts -- confirm against the caller).
ip1 = *ipiv * 2;
// Starting address of the row being swapped: derived from k1 by default and
// from k2 when MINUS is defined.
#ifndef MINUS
a1 = a + (k1 + 1) * 2;
#else
a1 = a + k2 * 2;
#endif
b1 = a + ip1;
// The row is its own pivot: every exchange would be a no-op.
if(a1 == b1) return 0;
// n iterations: exchange the FLOAT pair at a1 with the pair at b1, then
// advance both pointers by lda.
for(j=0; j<n; j++){
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
a1 += lda;
b1 += lda;
}
return 0;
}
j = (n >> 2); j = (n >> 2);
if (j > 0) { if (j > 0) {
@ -107,10 +139,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b7 = b1 + 3 * lda; b7 = b1 + 3 * lda;
b8 = b2 + 3 * lda; b8 = b2 + 3 * lda;
i = ((k2 - k1) >> 1); i = (rows >> 1);
i--;
if (i > 0) { //Loop pipeline
do { //Main Loop
while (i > 0) {
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
A3 = *(a2 + 0); A3 = *(a2 + 0);
@ -366,12 +400,260 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a7 -= 4; a7 -= 4;
#endif #endif
i --; i --;
} while (i > 0);
} }
i = ((k2 - k1) & 1); //Loop Ending
// Loop ending: process one pair of rows against their pivot rows for four
// pointer groups at once, without loading further pivot indices.
// Group k (k = 0..3) uses rows a(2k+1)/a(2k+2) and pivots b(2k+1)/b(2k+2);
// each element is two consecutive FLOATs, so group k owns A(4k+1)..A(4k+4)
// and B(4k+1)..B(4k+4). Only a1/a2/b1/b2 are compared: every group receives
// the same stores as group 0.
// NOTE(review): the groups are presumably four adjacent columns (b7/b8 are
// set to b1/b2 + 3 * lda before the loop); a2..a8 are defined outside this
// view -- confirm.
// All values are loaded before any store, so each branch can write from the
// saved copies. The stores in each branch amount to exchanging the element at
// a1 with the one at b1 and then a2 with b2, with every pointer-aliasing case
// spelled out.
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
A8 = *(a4 + 1);
A9 = *(a5 + 0);
A10 = *(a5 + 1);
A11 = *(a6 + 0);
A12 = *(a6 + 1);
A13 = *(a7 + 0);
A14 = *(a7 + 1);
A15 = *(a8 + 0);
A16 = *(a8 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
B9 = *(b5 + 0);
B10 = *(b5 + 1);
B11 = *(b6 + 0);
B12 = *(b6 + 1);
B13 = *(b7 + 0);
B14 = *(b7 + 1);
B15 = *(b8 + 0);
B16 = *(b8 + 1);
// Case 1: b1 aliases a1, so the first exchange is a no-op.
if (b1 == a1) {
if (b2 == a1) {
// Second pivot is a1: exchange a1 and a2 in every group.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = A9;
*(a6 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
} else
// b2 == a2 would be a no-op as well; otherwise exchange a2 and b2.
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
*(a6 + 0) = B11;
*(a6 + 1) = B12;
*(b6 + 0) = A11;
*(b6 + 1) = A12;
*(a8 + 0) = B15;
*(a8 + 1) = B16;
*(b8 + 0) = A15;
*(b8 + 1) = A16;
}
} else
// Case 2: b1 aliases a2, so the first exchange swaps a1 and a2.
if (b1 == a2) {
// If b2 == a1 the second exchange undoes the first: nothing is stored.
if (b2 != a1) {
if (b2 == a2) {
// Second exchange is a no-op; only a1 <-> a2 remains.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = A9;
*(a6 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
} else {
// Three-way rotation: a1 <- a2, a2 <- b2, b2 <- a1.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = B11;
*(a6 + 1) = B12;
*(b6 + 0) = A9;
*(b6 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = B15;
*(a8 + 1) = B16;
*(b8 + 0) = A13;
*(b8 + 1) = A14;
}
}
} else {
// Case 3: b1 is neither a1 nor a2.
if (b2 == a1) {
// Three-way rotation: a1 <- a2, a2 <- b1, b1 <- a1.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = B9;
*(a6 + 1) = B10;
*(b5 + 0) = A9;
*(b5 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = B13;
*(a8 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
} else
if (b2 == a2) {
// Second exchange is a no-op; only a1 <-> b1 remains.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(a5 + 0) = B9;
*(a5 + 1) = B10;
*(b5 + 0) = A9;
*(b5 + 1) = A10;
*(a7 + 0) = B13;
*(a7 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
} else
if (b2 == b1) {
// Both rows share one pivot: a1 <- b1, a2 <- a1, b1 <- a2.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
*(a5 + 0) = B9;
*(a5 + 1) = B10;
*(a6 + 0) = A9;
*(a6 + 1) = A10;
*(b5 + 0) = A11;
*(b5 + 1) = A12;
*(a7 + 0) = B13;
*(a7 + 1) = B14;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
*(b7 + 0) = A15;
*(b7 + 1) = A16;
} else {
// No aliasing at all: two independent exchanges per group.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
*(a5 + 0) = B9;
*(a5 + 1) = B10;
*(a6 + 0) = B11;
*(a6 + 1) = B12;
*(b5 + 0) = A9;
*(b5 + 1) = A10;
*(b6 + 0) = A11;
*(b6 + 1) = A12;
*(a7 + 0) = B13;
*(a7 + 1) = B14;
*(a8 + 0) = B15;
*(a8 + 1) = B16;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
*(b8 + 0) = A15;
*(b8 + 1) = A16;
}
}
// Step a1/a3/a5/a7 by the two rows just handled (two FLOATs each): forward
// by default, backward when MINUS is defined.
#ifndef MINUS
a1 += 4;
a3 += 4;
a5 += 4;
a7 += 4;
#else
a1 -= 4;
a3 -= 4;
a5 -= 4;
a7 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
b3 = b1 + 1 * lda;
b5 = b1 + 2 * lda;
b7 = b1 + 3 * lda;
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
A3 = *(a3 + 0); A3 = *(a3 + 0);
@ -435,37 +717,205 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b3 = b1 + lda; b3 = b1 + lda;
b4 = b2 + lda; b4 = b2 + lda;
i = ((k2 - k1) >> 1); i = (rows >> 1);
i--;
if (i > 0) { //Loop pipeline
do { //Main Loop
A1 = *(a1 + 0); while (i > 0) {
A2 = *(a1 + 1); A1 = *(a1 + 0);
A3 = *(a2 + 0); A2 = *(a1 + 1);
A4 = *(a2 + 1); A3 = *(a2 + 0);
A4 = *(a2 + 1);
A5 = *(a3 + 0); A5 = *(a3 + 0);
A6 = *(a3 + 1); A6 = *(a3 + 1);
A7 = *(a4 + 0); A7 = *(a4 + 0);
A8 = *(a4 + 1); A8 = *(a4 + 1);
B1 = *(b1 + 0); B1 = *(b1 + 0);
B2 = *(b1 + 1); B2 = *(b1 + 1);
B3 = *(b2 + 0); B3 = *(b2 + 0);
B4 = *(b2 + 1); B4 = *(b2 + 1);
B5 = *(b3 + 0); B5 = *(b3 + 0);
B6 = *(b3 + 1); B6 = *(b3 + 1);
B7 = *(b4 + 0); B7 = *(b4 + 0);
B8 = *(b4 + 1); B8 = *(b4 + 1);
ip1 = *piv * 2; ip1 = *piv * 2;
piv += incx; piv += incx;
ip2 = *piv * 2; ip2 = *piv * 2;
piv += incx; piv += incx;
if (b1 == a1) { if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
}
}
} else {
if (b2 == a1) { if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
}
b1 = a + ip1;
b2 = a + ip2;
b3 = b1 + lda;
b4 = b2 + lda;
#ifndef MINUS
a1 += 4;
a3 += 4;
#else
a1 -= 4;
a3 -= 4;
#endif
i --;
}
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
A8 = *(a4 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3; *(a1 + 0) = A3;
*(a1 + 1) = A4; *(a1 + 1) = A4;
*(a2 + 0) = A1; *(a2 + 0) = A1;
@ -474,122 +924,96 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a3 + 1) = A8; *(a3 + 1) = A8;
*(a4 + 0) = A5; *(a4 + 0) = A5;
*(a4 + 1) = A6; *(a4 + 1) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else } else
if (b2 != a2) { if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3; *(a2 + 0) = B3;
*(a2 + 1) = B4; *(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3; *(b2 + 0) = A3;
*(b2 + 1) = A4; *(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7; *(a4 + 0) = B7;
*(a4 + 1) = B8; *(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7; *(b4 + 0) = A7;
*(b4 + 1) = A8; *(b4 + 1) = A8;
} }
} else }
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
}
b1 = a + ip1;
b2 = a + ip2;
b3 = b1 + lda;
b4 = b2 + lda;
#ifndef MINUS #ifndef MINUS
a1 += 4; a1 += 4;
a3 += 4; a3 += 4;
#else #else
a1 -= 4; a1 -= 4;
a3 -= 4; a3 -= 4;
#endif #endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1); //Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
b3 = b1 + lda;
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
A3 = *(a3 + 0); A3 = *(a3 + 0);
@ -629,10 +1053,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b1 = a + ip1; b1 = a + ip1;
b2 = a + ip2; b2 = a + ip2;
i = ((k2 - k1) >> 1); i = (rows >> 1);
i--;
if (i > 0) { //Loop pipeline
do { //Main Loop
while (i > 0) {
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
A3 = *(a2 + 0); A3 = *(a2 + 0);
@ -720,12 +1146,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4; a1 -= 4;
#endif #endif
i --; i --;
} while (i > 0);
} }
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
i = ((k2 - k1) & 1); if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
}
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) { if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0); A1 = *(a1 + 0);
A2 = *(a1 + 1); A2 = *(a1 + 1);
B1 = *(b1 + 0); B1 = *(b1 + 0);

View File

@ -16,12 +16,17 @@ LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c
endif endif
ifeq ($(DYNAMIC_ARCH), 1)
LASWP = ../generic/laswp_k_4.c
ZLASWP = ../generic/zlaswp_k_4.c
endif
ifndef LASWP ifndef LASWP
LASWP = ../generic/laswp_k_1.c LASWP = ../generic/laswp_k.c
endif endif
ifndef ZLASWP ifndef ZLASWP
ZLASWP = ../generic/zlaswp_k_1.c ZLASWP = ../generic/zlaswp_k.c
endif endif
include ../generic/Makefile include ../generic/Makefile

View File

@ -21,12 +21,17 @@ LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c
endif endif
ifeq ($(DYNAMIC_ARCH), 1)
LASWP = ../generic/laswp_k_4.c
ZLASWP = ../generic/zlaswp_k_4.c
endif
ifndef LASWP ifndef LASWP
LASWP = ../generic/laswp_k_1.c LASWP = ../generic/laswp_k.c
endif endif
ifndef ZLASWP ifndef ZLASWP
ZLASWP = ../generic/zlaswp_k_1.c ZLASWP = ../generic/zlaswp_k.c
endif endif
include ../generic/Makefile include ../generic/Makefile

View File

@ -191,7 +191,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ + slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \
+ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ + slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \
+ slarrv.$(SUFFIX) slartv.$(SUFFIX) \ + slarrv.$(SUFFIX) slartv.$(SUFFIX) \
+ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slaswp.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ + slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \
+ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ + slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \
+ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ + sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \
+ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ + sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \
@ -345,7 +345,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ + clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \
+ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ + clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \
+ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ + clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \
+ claswp.$(SUFFIX) clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ + clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \
+ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ + clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \
+ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ + cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \
+ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ + cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \
@ -484,7 +484,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ + dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \
+ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ + dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \
+ dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ + dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \
+ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlaswp.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ + dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \
+ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ + dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \
+ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ + dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \
+ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ + dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \
@ -643,7 +643,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ + zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \
+ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ + zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \
+ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ + zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \
+ zlassq.$(SUFFIX) zlaswp.$(SUFFIX) zlasyf.$(SUFFIX) \ + zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \
+ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ + zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \
+ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ + zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \
+ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ + zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \

View File

@ -1,3 +1,4 @@
#!/bin/bash #!/bin/bash
echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio "
make BINARY=32 CC=gcc FC=gfortran make BINARY=32 CC=gcc FC=gfortran

View File

@ -1,3 +1,4 @@
#!/bin/bash #!/bin/bash
echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio "
make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran