Merge branch 'develop'

This commit is contained in:
Zhang Xianyi 2012-08-20 16:52:35 +08:00
commit 48f075cfd5
29 changed files with 2854 additions and 734 deletions

View File

@ -1,4 +1,17 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.3
20-Aug-2012
common:
* Fixed an unstable-behavior bug in the LAPACK ?laswp routines. (#130)
* Fixed a shared library bug that occurred when unloading the library on
Linux. (#132)
* Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2).
Please use gcc and IBM xlf. (#134)
x86/x86-64:
* Added support for the goto_set_num_threads and openblas_set_num_threads
APIs on Windows. They set the number of threads at runtime.
====================================================================
Version 0.2.2
6-July-2012

View File

@ -3,7 +3,7 @@ include ./Makefile.system
BLASDIRS = interface driver/level2 driver/level3 driver/others
ifndef DYNAMIC_ARCH
ifneq ($(DYNAMIC_ARCH), 1)
BLASDIRS += kernel
endif
@ -99,11 +99,9 @@ ifeq ($(OSNAME), Darwin)
endif
ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
endif
ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
endif
tests :
@ -147,7 +145,7 @@ ifeq ($(EXPRECISION), 1)
echo "#define EXPRECISION">> config_last.h
endif
##
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
@ -165,7 +163,7 @@ prof_blas :
$(MAKE) -C $$d prof || exit 1 ; \
fi; \
done
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonprof || exit 1
endif
@ -184,7 +182,7 @@ hpl :
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
@ -233,7 +231,7 @@ ifndef NOFORTRAN
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.2
VERSION = 0.2.3
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -108,19 +108,16 @@ VERSION = 0.2.2
# The installation directory.
# PREFIX = /opt/OpenBLAS
# Common Optimization Flag; -O2 is enough.
# DEBUG = 1
ifeq ($(DEBUG), 1)
COMMON_OPT += -g
# -DDEBUG
else
COMMON_OPT += -O2
endif
# Common Optimization Flag;
# The default -O2 is enough.
# COMMON_OPT = -O2
# Profiling flags
COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
#
# End of user configuration
#

View File

@ -244,7 +244,7 @@ endif
endif
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
@ -687,6 +687,15 @@ AWK = awk
REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
# Debug builds append debugging symbols (-g) to the common compiler flags.
ifeq ($(DEBUG), 1)
COMMON_OPT += -g
endif
# Fall back to -O2 only when COMMON_OPT is still empty/undefined here.
# Because the DEBUG block above appends -g first, a DEBUG=1 build with no
# other COMMON_OPT setting ends up with "-g" alone and skips this default.
ifndef COMMON_OPT
COMMON_OPT = -O2
endif
CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
@ -705,7 +714,7 @@ ifndef LIBSUFFIX
LIBSUFFIX = a
endif
ifndef DYNAMIC_ARCH
ifneq ($(DYNAMIC_ARCH), 1)
ifndef SMP
LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX)
LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX)
@ -724,8 +733,8 @@ endif
endif
LIBDLLNAME = $(LIBPREFIX).dll
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll)
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)

View File

@ -100,6 +100,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
* The number of CPUs/Cores should be less than or equal to 256.
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). To avoid it, you can build the library with NO_AFFINITY=1.
* On Loongson 3A, `make test` may fail because of a pthread_create error (the error code is EAGAIN). However, the same test case runs correctly when you run it directly from the shell.
## Specification of Git Branches

View File

@ -45,7 +45,7 @@ extern "C" {
int BLASFUNC(xerbla)(char *, blasint *info, blasint);
void BLASFUNC(openblas_set_num_threads)(int *);
void openblas_set_num_threads_(int *);
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);

View File

@ -14,7 +14,7 @@ endif
# COMMONOBJS += info.$(SUFFIX)
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
COMMONOBJS += dynamic.$(SUFFIX)
else
COMMONOBJS += parameter.$(SUFFIX)
@ -70,7 +70,7 @@ ifndef BLAS_SERVER
BLAS_SERVER = blas_server.c
endif
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)

View File

@ -435,7 +435,7 @@ static int blas_thread_server(void *arg){
blas_memory_free(buffer);
pthread_exit(NULL);
//pthread_exit(NULL);
return 0;
}

View File

@ -63,13 +63,7 @@ static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER];
void goto_set_num_threads(int num)
{
}
void openblas_set_num_threads(int num)
{
}
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
@ -187,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1));
} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
if (action == WAIT_OBJECT_0 + 1) break;
@ -271,7 +265,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
} else {
legacy_exec(routine, queue -> mode, queue -> args, sb);
}
}
}else{
continue; //if queue == NULL
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
@ -433,7 +429,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
/* Shutdown procedure. Users don't have to call this routine; the */
/* kernel automatically kills the threads. */
int blas_thread_shutdown_(void){
int BLASFUNC(blas_thread_shutdown)(void){
int i;
@ -445,7 +441,7 @@ int blas_thread_shutdown_(void){
SetEvent(pool.killed);
for(i = 0; i < blas_cpu_number - 1; i++){
for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], INFINITE);
}
@ -456,3 +452,47 @@ int blas_thread_shutdown_(void){
return 0;
}
/*
 * Set the number of threads OpenBLAS uses for subsequent calls (Windows
 * thread server).
 *
 * num_threads: requested thread count. Values below 1 select the current
 *              blas_cpu_number; values above MAX_CPU_NUMBER are clamped to
 *              MAX_CPU_NUMBER.
 *
 * The worker pool only ever grows: when the request exceeds the current
 * pool size (blas_num_threads), the missing workers are created under
 * server_lock. A smaller request leaves the existing workers in place and
 * only lowers blas_cpu_number.
 */
void goto_set_num_threads(int num_threads)
{
  long i;

  if (num_threads < 1) num_threads = blas_cpu_number;

  if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

  if (num_threads > blas_num_threads) {

    LOCK_COMMAND(&server_lock);

    /* First use: set up the shared queue state before any worker starts. */
    if (!blas_server_avail) {
      InitializeCriticalSection(&pool.lock);
      pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
      pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);

      pool.shutdown = 0;
      pool.queue = NULL;
      blas_server_avail = 1;
    }

    /*
     * A pool of size N holds N - 1 worker handles in slots 0 .. N - 2, so
     * new workers normally start at slot blas_num_threads - 1. When the
     * pool is still empty (blas_num_threads == 0) that expression would be
     * -1 and write before the start of blas_threads[]; start at slot 0.
     */
    for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0;
         i < num_threads - 1; i++) {
      blas_threads[i] = CreateThread(NULL, 0,
                                     blas_thread_server, (void *)i,
                                     0, &blas_threads_id[i]);
    }

    blas_num_threads = num_threads;

    UNLOCK_COMMAND(&server_lock);
  }

  blas_cpu_number = num_threads;
}
/* OpenBLAS-named entry point: forwards num unchanged to goto_set_num_threads(). */
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}

View File

@ -60,6 +60,8 @@ extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BOBCAT;
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
@ -122,15 +124,24 @@ static gotoblas_t *get_coretype(void){
if (model == 12) return &gotoblas_ATOM;
return NULL;
case 2:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM;
case 2:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM;
//Intel Xeon Processor 5600 (Westmere-EP)
if (model == 12) return &gotoblas_NEHALEM;
return NULL;
//Intel Xeon Processor 5600 (Westmere-EP)
//Xeon Processor E7 (Westmere-EX)
if (model == 12 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5
if (model == 10 || model == 13) return &gotoblas_SANDYBRIDGE;
return NULL;
case 3:
//Intel Sandy Bridge 22nm (Ivy Bridge?)
if (model == 10) return &gotoblas_SANDYBRIDGE;
return NULL;
}
case 0xf:
if (model <= 0x2) return &gotoblas_NORTHWOOD;
@ -144,7 +155,9 @@ static gotoblas_t *get_coretype(void){
if ((exfamily == 0) || (exfamily == 2)) {
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON;
} else {
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else {
return &gotoblas_BARCELONA;
}
}
@ -178,6 +191,8 @@ static char *corename[] = {
"Opteron(SSE3)",
"Barcelona",
"Nano",
"Sandybridge",
"Bobcat",
};
char *gotoblas_corename(void) {
@ -197,7 +212,9 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_OPTERON) return corename[13];
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
return corename[0];
}

View File

@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -206,7 +206,15 @@ int get_num_procs(void) {
#endif
/*
The number of CPU cores OpenBLAS uses for multithreading.
It can be set with openblas_set_num_threads(int num_threads).
*/
int blas_cpu_number = 0;
/*
The number of threads in the thread pool.
This value is greater than or equal to blas_cpu_number, which means some threads may be sleeping.
*/
int blas_num_threads = 0;
int goto_get_num_procs (void) {
@ -1289,6 +1297,7 @@ void DESTRUCTOR gotoblas_quit(void) {
moncontrol (1);
#endif
blas_shutdown();
}
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern void openblas_set_num_threads(int num_threads) ;
void NAME(int* num_threads){
void openblas_set_num_threads_(int* num_threads){
openblas_set_num_threads(*num_threads);
}
@ -46,7 +46,7 @@ void NAME(int* num_threads){
void openblas_set_num_threads(int num_threads) {
}
void NAME(int* num_threads){
void openblas_set_num_threads_(int* num_threads){
}
#endif

View File

@ -66,6 +66,11 @@ dll : ../$(LIBDLLNAME)
dll2 : libgoto2_shared.dll
# On Windows, we only generate a DLL without a version suffix. This is because
# applications which link against the dynamic library reference a fixed DLL name
# in their import table. By instead using a stable name it is possible to
# upgrade between library versions, without needing to re-link an application.
# For more details see: https://github.com/xianyi/OpenBLAS/issues/127.
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1)

View File

@ -48,7 +48,7 @@ HPLOBJS = \
COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX)
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX)
endif

View File

@ -118,7 +118,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
min_jj = js + min_j - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (GEMM_UNROLL_N <= 8) {
if (0 && GEMM_UNROLL_N <= 8) {
LASWP_NCOPY(min_jj, off + 1, off + k,
c + (- off + jjs * lda) * COMPSIZE, lda,
@ -245,7 +245,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (GEMM_UNROLL_N <= 8) {
if (0 && GEMM_UNROLL_N <= 8) {
printf("helllo\n");
LASWP_NCOPY(min_jj, off + 1, off + k,
b + (- off + jjs * lda) * COMPSIZE, lda,

View File

@ -77,10 +77,21 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_jj = js + min_j - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#if 0
LASWP_NCOPY(min_jj, off + 1, off + k,
c + (- off + jjs * lda) * COMPSIZE, lda,
ipiv, sb + k * (jjs - js) * COMPSIZE);
#else
LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
#ifdef COMPLEX
ZERO,
#endif
c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);
GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE);
#endif
for (is = 0; is < k; is += GEMM_P) {
min_i = k - is;
if (min_i > GEMM_P) min_i = GEMM_P;

View File

@ -113,7 +113,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
min_jj = js + jmin - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#if 0
#if 1
LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO,
#ifdef COMPLEX
ZERO,

View File

@ -48,7 +48,7 @@
int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2;
BLASLONG i, j, ip1, ip2, rows;
blasint *piv;
FLOAT *a1;
FLOAT *b1, *b2;
@ -58,13 +58,34 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
k1 --;
#ifndef MINUS
ipiv += k1
;
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#endif
if (n <= 0) return 0;
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *a1;
B1 = *b1;
*a1 = B1;
*b1 = A1;
a1 += lda;
b1 += lda;
}
return 0;
}
j = n;
if (j > 0) {
@ -85,10 +106,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b1 = a + ip1;
b2 = a + ip2;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i = (rows >> 1);
i--;
//Main Loop
while (i > 0) {
#ifdef OPTERON
#ifndef MINUS
asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1));
@ -172,12 +194,69 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a1 -= 2;
#endif
i --;
} while (i > 0);
}
//Loop Ending
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
#ifndef MINUS
a1 += 2;
#else
a1 -= 2;
#endif
i = ((k2 - k1) & 1);
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
A1 = *a1;
B1 = *b1;
*a1 = B1;

View File

@ -50,7 +50,7 @@
int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2;
BLASLONG i, j, ip1, ip2, rows;
blasint *piv;
FLOAT *a1, *a3;
FLOAT *b1, *b2, *b3, *b4;
@ -60,8 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
k1 --;
#ifndef MINUS
ipiv += k1
;
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#endif
@ -69,6 +68,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
if (n <= 0) return 0;
j = (n >> 1);
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *a1;
B1 = *b1;
*a1 = B1;
*b1 = A1;
a1 += lda;
b1 += lda;
}
return 0;
}
if (j > 0) {
do {
piv = ipiv;
@ -92,10 +113,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
i = ((k2 - k1) >> 1);
i = ((rows) >> 1);
if (i > 0) {
do {
// Loop pipeline
i--;
//Main Loop
while (i > 0) {
#ifdef CORE2
#ifndef MINUS
asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
@ -202,12 +226,99 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a3 -= 2;
#endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1);
//Loop Ending
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
*a3 = A4;
*a4 = B4;
*b4 = A3;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
*a3 = A4;
*a4 = B3;
*b3 = A3;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
*a3 = B3;
*a4 = A3;
*b3 = A4;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
*a3 = B3;
*a4 = B4;
*b3 = A3;
*b4 = A4;
}
}
#ifndef MINUS
a1 += 2;
a3 += 2;
#else
a1 -= 2;
a3 -= 2;
#endif
//Remain
i = ((rows) & 1);
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
b3 = b1 + 1 * lda;
A1 = *a1;
B1 = *b1;
A3 = *a3;
@ -240,78 +351,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b1 = a + ip1;
b2 = a + ip2;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
i = ((rows) >> 1);
i --;
while (i > 0) {
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
if (b1 == a1) {
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a2 = B1;
*b1 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
} else
if (b2 == a2) {
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
*b2 = A2;
}
}
b1 = a + ip1;
b2 = a + ip2;
b1 = a + ip1;
b2 = a + ip2;
#ifndef MINUS
a1 += 2;
a1 += 2;
#else
a1 -= 2;
a1 -= 2;
#endif
i --;
} while (i > 0);
i --;
}
i = ((k2 - k1) & 1);
//Loop Ending (n=1)
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
#ifndef MINUS
a1 += 2;
#else
a1 -= 2;
#endif
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
A1 = *a1;
B1 = *b1;
*a1 = B1;

View File

@ -54,7 +54,7 @@
int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2;
BLASLONG i, j, ip1, ip2, rows;
blasint *piv;
FLOAT *a1, *a3, *a5, *a7;
FLOAT *b1, *b2, *b3, *b4;
@ -66,14 +66,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
k1 --;
#ifndef MINUS
ipiv += k1
;
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#endif
if (n <= 0) return 0;
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *a1;
B1 = *b1;
*a1 = B1;
*b1 = A1;
a1 += lda;
b1 += lda;
}
return 0;
}
j = (n >> 2);
if (j > 0) {
do {
@ -106,8 +127,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i--; //Loop pipeline
//Main Loop
while (i > 0) {
A1 = *a1;
A2 = *a2;
A3 = *a3;
@ -259,12 +281,156 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a7 -= 2;
#endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1);
//Loop Ending
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
A5 = *a5;
A6 = *a6;
A7 = *a7;
A8 = *a8;
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
B5 = *b5;
B6 = *b6;
B7 = *b7;
B8 = *b8;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
*a5 = A6;
*a6 = A5;
*a7 = A8;
*a8 = A7;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
*a6 = B6;
*b6 = A6;
*a8 = B8;
*b8 = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
*a5 = A6;
*a6 = A5;
*a7 = A8;
*a8 = A7;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
*a3 = A4;
*a4 = B4;
*b4 = A3;
*a5 = A6;
*a6 = B6;
*b6 = A5;
*a7 = A8;
*a8 = B8;
*b8 = A7;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
*a3 = A4;
*a4 = B3;
*b3 = A3;
*a5 = A6;
*a6 = B5;
*b5 = A5;
*a7 = A8;
*a8 = B7;
*b7 = A7;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
*a5 = B5;
*b5 = A5;
*a7 = B7;
*b7 = A7;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
*a3 = B3;
*a4 = A3;
*b3 = A4;
*a5 = B5;
*a6 = A5;
*b5 = A6;
*a7 = B7;
*a8 = A7;
*b7 = A8;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
*a3 = B3;
*a4 = B4;
*b3 = A3;
*b4 = A4;
*a5 = B5;
*a6 = B6;
*b5 = A5;
*b6 = A6;
*a7 = B7;
*a8 = B8;
*b7 = A7;
*b8 = A8;
}
}
#ifndef MINUS
a1 += 2;
a3 += 2;
a5 += 2;
a7 += 2;
#else
a1 -= 2;
a3 -= 2;
a5 -= 2;
a7 -= 2;
#endif
//Remain
i = ((rows) & 1);
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
b3 = b1 + 1 * lda;
b5 = b1 + 2 * lda;
b7 = b1 + 3 * lda;
A1 = *a1;
B1 = *b1;
A3 = *a3;
@ -312,10 +478,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i = ((rows) >> 1);
i--;
while (i > 0) {
A1 = *a1;
A2 = *a2;
A3 = *a3;
@ -409,12 +575,97 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a3 -= 2;
#endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1);
//Loop Ending
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
*a3 = A4;
*a4 = B4;
*b4 = A3;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
*a3 = A4;
*a4 = B3;
*b3 = A3;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
*a3 = B3;
*a4 = A3;
*b3 = A4;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
*a3 = B3;
*a4 = B4;
*b3 = A3;
*b4 = A4;
}
}
#ifndef MINUS
a1 += 2;
a3 += 2;
#else
a1 -= 2;
a3 -= 2;
#endif
i = ((rows) & 1);
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
b3 = b1 + 1 * lda;
A1 = *a1;
B1 = *b1;
A3 = *a3;
@ -445,78 +696,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b1 = a + ip1;
b2 = a + ip2;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
i = ((rows) >> 1);
i --;
while (i > 0) {
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
if (b1 == a1) {
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a2 = B1;
*b1 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
} else
if (b2 == a2) {
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
*b2 = A2;
}
}
b1 = a + ip1;
b2 = a + ip2;
b1 = a + ip1;
b2 = a + ip2;
#ifndef MINUS
a1 += 2;
a1 += 2;
#else
a1 -= 2;
a1 -= 2;
#endif
i --;
} while (i > 0);
i --;
}
i = ((k2 - k1) & 1);
//Loop Ending (n=1)
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
} else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*a1 = A2;
*a2 = A1;
} else {
*a1 = A2;
*a2 = B2;
*b2 = A1;
}
}
} else {
if (b2 == a1) {
*a1 = A2;
*a2 = B1;
*b1 = A1;
} else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
} else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
*b1 = A2;
} else {
*a1 = B1;
*a2 = B2;
*b1 = A1;
*b2 = A2;
}
}
#ifndef MINUS
a1 += 2;
#else
a1 -= 2;
#endif
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
A1 = *a1;
B1 = *b1;
*a1 = B1;

File diff suppressed because it is too large Load Diff

View File

@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2;
BLASLONG i, j, ip1, ip2, rows;
blasint *piv;
FLOAT *a1;
FLOAT *b1, *b2;
@ -66,6 +66,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
if (n <= 0) return 0;
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv * 2;
#ifndef MINUS
a1 = a + (k1 + 1) * 2;
#else
a1 = a + k2 * 2;
#endif
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
a1 += lda;
b1 += lda;
}
return 0;
}
j = n;
if (j > 0) {
@ -87,9 +119,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b2 = a + ip2;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i --;
//Loop pipeline
//Main Loop
while (i > 0) {
#ifdef OPTERON
#ifndef MINUS
asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1));
@ -198,12 +231,98 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4;
#endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1);
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
}
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);

View File

@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2;
BLASLONG i, j, ip1, ip2, rows;
blasint *piv;
FLOAT *a1;
FLOAT *b1, *b2;
@ -68,6 +68,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
if (n <= 0) return 0;
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
//Only have 1 row
ip1 = *ipiv * 2;
#ifndef MINUS
a1 = a + (k1 + 1) * 2;
#else
a1 = a + k2 * 2;
#endif
b1 = a + ip1;
if(a1 == b1) return 0;
for(j=0; j<n; j++){
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
a1 += lda;
b1 += lda;
}
return 0;
}
j = (n >> 1);
if (j > 0) {
@ -88,10 +120,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b1 = a + ip1;
b2 = a + ip2;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i = (rows >> 1);
i--;
//Loop pipeline
//Main Loop
while (i > 0) {
#ifdef CORE2
#ifndef MINUS
asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
@ -246,12 +280,149 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4;
#endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1);
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
A5 = *(a1 + 0 + lda);
A6 = *(a1 + 1 + lda);
A7 = *(a2 + 0 + lda);
A8 = *(a2 + 1 + lda);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
B5 = *(b1 + 0 + lda);
B6 = *(b1 + 1 + lda);
B7 = *(b2 + 0 + lda);
B8 = *(b2 + 1 + lda);
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a2 + 0 + lda) = B7;
*(a2 + 1 + lda) = B8;
*(b2 + 0 + lda) = A7;
*(b2 + 1 + lda) = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = B7;
*(a2 + 1 + lda) = B8;
*(b2 + 0 + lda) = A5;
*(b2 + 1 + lda) = A6;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a1 + 0 + lda) = A7;
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = B5;
*(a2 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a1 + 0 + lda) = B5;
*(a1 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a1 + 0 + lda) = B5;
*(a1 + 1 + lda) = B6;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
*(b1 + 0 + lda) = A7;
*(b1 + 1 + lda) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a1 + 0 + lda) = B5;
*(a1 + 1 + lda) = B6;
*(a2 + 0 + lda) = B7;
*(a2 + 1 + lda) = B8;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
*(b2 + 0 + lda) = A7;
*(b2 + 1 + lda) = A8;
}
}
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a1 + 0 + lda);
@ -293,10 +464,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b1 = a + ip1;
b2 = a + ip2;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i = (rows >> 1);
i--;
//Loop pipeline
//Main Loop
while (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
@ -384,12 +557,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4;
#endif
i --;
} while (i > 0);
}
//Loop Ending
// Final row-pair step, written out on its own after the main loop. Unlike a
// loop iteration that fetches the next pivots, this block reads nothing from
// piv: it only works on the current a1 / a2 / b1 / b2 pointers.
// A1,A2 and A3,A4 are the two FLOATs stored at a1 and a2; B1,B2 and B3,B4
// are the two FLOATs stored at b1 and b2 (presumably the real and imaginary
// parts of one complex element -- confirm against the file header).
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
// The stores below have the same net effect as "exchange a1 with b1, then
// exchange a2 with b2", using the values loaded above. Every way the four
// pointers can alias each other gets its own branch so that no location is
// written with a stale value.
if (b1 == a1) {
// b1 aliases a1: the first exchange does nothing.
if (b2 == a1) {
// b2 also aliases a1: a1 and a2 trade values.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else
// b2 == a2 would make the second exchange a no-op too, so nothing is stored.
if (b2 != a2) {
// Plain exchange of a2 with b2; a1 is left untouched.
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
} else
if (b1 == a2) {
// b1 aliases a2. If b2 == a1 the two exchanges cancel out, so nothing is
// stored in that case.
if (b2 != a1) {
if (b2 == a2) {
// Second exchange is a no-op: a1 and a2 trade values.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else {
// Three-way rotation: a1 takes a2's value, a2 takes b2's, b2 takes a1's.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
}
}
} else {
// b1 is distinct from both a1 and a2.
if (b2 == a1) {
// Three-way rotation: a1 takes a2's value, a2 takes b1's, b1 takes a1's.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == a2) {
// Second exchange is a no-op: only a1 and b1 trade values.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == b1) {
// Both pivots name the same location: a1 takes b1's value, a2 takes a1's,
// and b1 ends up with a2's.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
} else {
// No aliasing at all: two independent exchanges.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
}
// Step a1 past the two elements just handled (4 FLOATs); the MINUS build
// walks in the opposite direction.
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
i = ((k2 - k1) & 1);
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);

View File

@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2;
BLASLONG i, j, ip1, ip2, rows;
blasint *piv;
FLOAT *a1, *a3, *a5, *a7;
FLOAT *b1, *b2, *b3, *b4;
@ -76,6 +76,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
if (n <= 0) return 0;
rows = k2 - k1;

// An empty (or inverted) row range leaves the matrix untouched.
if (rows <= 0) return 0;

if (rows == 1) {
  // Exactly one row to exchange: handle it here, one column at a time,
  // and return without entering the unrolled row-pair code below.
#ifndef MINUS
  a1 = a + (k1 + 1) * 2;
#else
  a1 = a + k2 * 2;
#endif
  // Each element occupies two FLOATs, hence the pivot index is doubled.
  b1 = a + (*ipiv) * 2;

  // When the pivot names the row itself there is nothing to exchange.
  if (a1 != b1) {
    for (j = n; j > 0; j--) {
      A1 = a1[0];
      A2 = a1[1];
      B1 = b1[0];
      B2 = b1[1];

      a1[0] = B1;
      a1[1] = B2;
      b1[0] = A1;
      b1[1] = A2;

      // Move both pointers by lda to reach the same row in the next column.
      a1 += lda;
      b1 += lda;
    }
  }
  return 0;
}
j = (n >> 2);
if (j > 0) {
@ -107,10 +139,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b7 = b1 + 3 * lda;
b8 = b2 + 3 * lda;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i = (rows >> 1);
i--;
//Loop pipeline
//Main Loop
while (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
@ -366,12 +400,260 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a7 -= 4;
#endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1);
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
A8 = *(a4 + 1);
A9 = *(a5 + 0);
A10 = *(a5 + 1);
A11 = *(a6 + 0);
A12 = *(a6 + 1);
A13 = *(a7 + 0);
A14 = *(a7 + 1);
A15 = *(a8 + 0);
A16 = *(a8 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
B9 = *(b5 + 0);
B10 = *(b5 + 1);
B11 = *(b6 + 0);
B12 = *(b6 + 1);
B13 = *(b7 + 0);
B14 = *(b7 + 1);
B15 = *(b8 + 0);
B16 = *(b8 + 1);
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = A9;
*(a6 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
*(a6 + 0) = B11;
*(a6 + 1) = B12;
*(b6 + 0) = A11;
*(b6 + 1) = A12;
*(a8 + 0) = B15;
*(a8 + 1) = B16;
*(b8 + 0) = A15;
*(b8 + 1) = A16;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = A9;
*(a6 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = B11;
*(a6 + 1) = B12;
*(b6 + 0) = A9;
*(b6 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = B15;
*(a8 + 1) = B16;
*(b8 + 0) = A13;
*(b8 + 1) = A14;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(a5 + 0) = A11;
*(a5 + 1) = A12;
*(a6 + 0) = B9;
*(a6 + 1) = B10;
*(b5 + 0) = A9;
*(b5 + 1) = A10;
*(a7 + 0) = A15;
*(a7 + 1) = A16;
*(a8 + 0) = B13;
*(a8 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(a5 + 0) = B9;
*(a5 + 1) = B10;
*(b5 + 0) = A9;
*(b5 + 1) = A10;
*(a7 + 0) = B13;
*(a7 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
*(a5 + 0) = B9;
*(a5 + 1) = B10;
*(a6 + 0) = A9;
*(a6 + 1) = A10;
*(b5 + 0) = A11;
*(b5 + 1) = A12;
*(a7 + 0) = B13;
*(a7 + 1) = B14;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
*(b7 + 0) = A15;
*(b7 + 1) = A16;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
*(a5 + 0) = B9;
*(a5 + 1) = B10;
*(a6 + 0) = B11;
*(a6 + 1) = B12;
*(b5 + 0) = A9;
*(b5 + 1) = A10;
*(b6 + 0) = A11;
*(b6 + 1) = A12;
*(a7 + 0) = B13;
*(a7 + 1) = B14;
*(a8 + 0) = B15;
*(a8 + 1) = B16;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
*(b8 + 0) = A15;
*(b8 + 1) = A16;
}
}
#ifndef MINUS
a1 += 4;
a3 += 4;
a5 += 4;
a7 += 4;
#else
a1 -= 4;
a3 -= 4;
a5 -= 4;
a7 -= 4;
#endif
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
b3 = b1 + 1 * lda;
b5 = b1 + 2 * lda;
b7 = b1 + 3 * lda;
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a3 + 0);
@ -435,37 +717,205 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b3 = b1 + lda;
b4 = b2 + lda;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
A8 = *(a4 + 1);
i = (rows >> 1);
i--;
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
//Loop pipeline
//Main Loop
while (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
A8 = *(a4 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
if (b1 == a1) {
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
}
b1 = a + ip1;
b2 = a + ip2;
b3 = b1 + lda;
b4 = b2 + lda;
#ifndef MINUS
a1 += 4;
a3 += 4;
#else
a1 -= 4;
a3 -= 4;
#endif
i --;
}
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
A8 = *(a4 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
@ -474,122 +924,96 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 != a2) {
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
} else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
} else {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b4 + 0) = A5;
*(b4 + 1) = A6;
}
}
} else {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = A7;
*(a3 + 1) = A8;
*(a4 + 0) = B5;
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
} else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
*(b3 + 0) = A7;
*(b3 + 1) = A8;
} else {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(a3 + 0) = B5;
*(a3 + 1) = B6;
*(a4 + 0) = B7;
*(a4 + 1) = B8;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
}
b1 = a + ip1;
b2 = a + ip2;
b3 = b1 + lda;
b4 = b2 + lda;
}
#ifndef MINUS
a1 += 4;
a3 += 4;
a1 += 4;
a3 += 4;
#else
a1 -= 4;
a3 -= 4;
a1 -= 4;
a3 -= 4;
#endif
i --;
} while (i > 0);
}
i = ((k2 - k1) & 1);
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
b3 = b1 + lda;
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a3 + 0);
@ -629,10 +1053,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
b1 = a + ip1;
b2 = a + ip2;
i = ((k2 - k1) >> 1);
if (i > 0) {
do {
i = (rows >> 1);
i--;
//Loop pipeline
//Main Loop
while (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
@ -720,12 +1146,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
a1 -= 4;
#endif
i --;
} while (i > 0);
}
//Loop Ending
// Final row-pair step, written out on its own after the main loop. Unlike a
// loop iteration that fetches the next pivots, this block reads nothing from
// piv: it only works on the current a1 / a2 / b1 / b2 pointers.
// A1,A2 and A3,A4 are the two FLOATs stored at a1 and a2; B1,B2 and B3,B4
// are the two FLOATs stored at b1 and b2 (presumably the real and imaginary
// parts of one complex element -- confirm against the file header).
A1 = *(a1 + 0);
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
// The stores below have the same net effect as "exchange a1 with b1, then
// exchange a2 with b2", using the values loaded above. Every way the four
// pointers can alias each other gets its own branch so that no location is
// written with a stale value.
if (b1 == a1) {
// b1 aliases a1: the first exchange does nothing.
if (b2 == a1) {
// b2 also aliases a1: a1 and a2 trade values.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else
// b2 == a2 would make the second exchange a no-op too, so nothing is stored.
if (b2 != a2) {
// Plain exchange of a2 with b2; a1 is left untouched.
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
} else
if (b1 == a2) {
// b1 aliases a2. If b2 == a1 the two exchanges cancel out, so nothing is
// stored in that case.
if (b2 != a1) {
if (b2 == a2) {
// Second exchange is a no-op: a1 and a2 trade values.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
} else {
// Three-way rotation: a1 takes a2's value, a2 takes b2's, b2 takes a1's.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A1;
*(b2 + 1) = A2;
}
}
} else {
// b1 is distinct from both a1 and a2.
if (b2 == a1) {
// Three-way rotation: a1 takes a2's value, a2 takes b1's, b1 takes a1's.
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = B1;
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == a2) {
// Second exchange is a no-op: only a1 and b1 trade values.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
} else
if (b2 == b1) {
// Both pivots name the same location: a1 takes b1's value, a2 takes a1's,
// and b1 ends up with a2's.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
*(b1 + 0) = A3;
*(b1 + 1) = A4;
} else {
// No aliasing at all: two independent exchanges.
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
}
// Step a1 past the two elements just handled (4 FLOATs); the MINUS build
// walks in the opposite direction.
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
i = ((k2 - k1) & 1);
//Remain
i = (rows & 1);
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
A1 = *(a1 + 0);
A2 = *(a1 + 1);
B1 = *(b1 + 0);

View File

@ -16,12 +16,17 @@ LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c
endif
# When DYNAMIC_ARCH is set to 1, build LASWP/ZLASWP from the generic
# laswp_k_4.c / zlaswp_k_4.c sources; these assignments replace any value
# given to LASWP or ZLASWP earlier in this file.
ifeq ($(DYNAMIC_ARCH), 1)
LASWP = ../generic/laswp_k_4.c
ZLASWP = ../generic/zlaswp_k_4.c
endif
ifndef LASWP
LASWP = ../generic/laswp_k_1.c
LASWP = ../generic/laswp_k.c
endif
ifndef ZLASWP
ZLASWP = ../generic/zlaswp_k_1.c
ZLASWP = ../generic/zlaswp_k.c
endif
include ../generic/Makefile

View File

@ -21,12 +21,17 @@ LASWP = ../generic/laswp_k_1.c
ZLASWP = ../generic/zlaswp_k_1.c
endif
# When DYNAMIC_ARCH is set to 1, build LASWP/ZLASWP from the generic
# laswp_k_4.c / zlaswp_k_4.c sources; these assignments replace any value
# given to LASWP or ZLASWP earlier in this file.
ifeq ($(DYNAMIC_ARCH), 1)
LASWP = ../generic/laswp_k_4.c
ZLASWP = ../generic/zlaswp_k_4.c
endif
ifndef LASWP
LASWP = ../generic/laswp_k_1.c
LASWP = ../generic/laswp_k.c
endif
ifndef ZLASWP
ZLASWP = ../generic/zlaswp_k_1.c
ZLASWP = ../generic/zlaswp_k.c
endif
include ../generic/Makefile

View File

@ -191,7 +191,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \
+ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \
+ slarrv.$(SUFFIX) slartv.$(SUFFIX) \
+ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slaswp.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \
+ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \
+ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \
+ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \
+ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \
@ -345,7 +345,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \
+ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \
+ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \
+ claswp.$(SUFFIX) clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \
+ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \
+ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \
+ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \
+ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \
@ -484,7 +484,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \
+ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \
+ dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \
+ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlaswp.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \
+ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \
+ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \
+ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \
+ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \
@ -643,7 +643,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile
+ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \
+ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \
+ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \
+ zlassq.$(SUFFIX) zlaswp.$(SUFFIX) zlasyf.$(SUFFIX) \
+ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \
+ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \
+ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \
+ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \

View File

@ -1,3 +1,4 @@
#!/bin/bash
# Quick-build helper: prints a pointer to the Visual Studio usage notes on the
# project wiki, then runs make for a 32-bit build (BINARY=32) with gcc as the
# C compiler and gfortran as the Fortran compiler.
echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio "
make BINARY=32 CC=gcc FC=gfortran

View File

@ -1,3 +1,4 @@
#!/bin/bash
# Quick-build helper: prints a pointer to the Visual Studio usage notes on the
# project wiki, then runs make for a 64-bit build (BINARY=64) with the
# x86_64-w64-mingw32 gcc and gfortran compilers.
echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio "
make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran