diff --git a/Changelog.txt b/Changelog.txt index 4e80473d6..3d6151bb6 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,17 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.3 +20-Aug-2012 +common: + * Fixed LAPACK unstable bug about ?laswp. (#130) + * Fixed the shared library bug about unloading the library on + Linux (#132). + * Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2) + Please use gcc and IBM xlf. (#134) +x86/x86-64: + * Supported goto_set_num_threads and openblas_set_num_threads + APIs in Windows. They can set the number of threads on runtime. + ==================================================================== Version 0.2.2 6-July-2012 diff --git a/Makefile b/Makefile index 796217291..d95373086 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ include ./Makefile.system BLASDIRS = interface driver/level2 driver/level3 driver/others -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH), 1) BLASDIRS += kernel endif @@ -99,11 +99,9 @@ ifeq ($(OSNAME), Darwin) endif ifeq ($(OSNAME), WINNT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif tests : @@ -147,7 +145,7 @@ ifeq ($(EXPRECISION), 1) echo "#define EXPRECISION">> config_last.h endif ## -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ @@ -165,7 +163,7 @@ prof_blas : $(MAKE) -C $$d prof || exit 1 ; \ fi; \ done -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonprof || exit 1 endif @@ -184,7 +182,7 @@ hpl : $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ @@ -233,7 +231,7 @@ ifndef NOFORTRAN -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif diff --git a/Makefile.rule b/Makefile.rule index 85abf584b..57094377a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.2 +VERSION = 0.2.3 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -108,19 +108,16 @@ VERSION = 0.2.2 # The installation directory. # PREFIX = /opt/OpenBLAS -# Common Optimization Flag; -O2 is enough. -# DEBUG = 1 - -ifeq ($(DEBUG), 1) -COMMON_OPT += -g -# -DDEBUG -else -COMMON_OPT += -O2 -endif +# Common Optimization Flag; +# The default -O2 is enough. +# COMMON_OPT = -O2 # Profiling flags COMMON_PROF = -pg +# Build Debug version +# DEBUG = 1 + # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index 425cbb68a..b2180f30d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -244,7 +244,7 @@ endif endif -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO @@ -687,6 +687,15 @@ AWK = awk REVISION = -r$(VERSION) MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) +ifeq ($(DEBUG), 1) +COMMON_OPT += -g +endif + +ifndef COMMON_OPT +COMMON_OPT = -O2 +endif + + CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) @@ -705,7 +714,7 @@ ifndef LIBSUFFIX LIBSUFFIX = a endif -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH), 1) ifndef SMP LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) @@ -724,8 +733,8 @@ endif endif +LIBDLLNAME = $(LIBPREFIX).dll LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) -LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) diff --git a/README.md b/README.md index 52d098366..befc14fc0 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * The number of CPUs/Cores should less than or equal to 256. +* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. ## Specification of Git Branches diff --git a/common_interface.h b/common_interface.h index dbe0bb851..cc5771daa 100644 --- a/common_interface.h +++ b/common_interface.h @@ -45,7 +45,7 @@ extern "C" { int BLASFUNC(xerbla)(char *, blasint *info, blasint); -void BLASFUNC(openblas_set_num_threads)(int *); +void openblas_set_num_threads_(int *); FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/driver/others/Makefile b/driver/others/Makefile index 2fdbb4a42..921f47c9c 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -14,7 +14,7 @@ endif # COMMONOBJS += info.$(SUFFIX) -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) COMMONOBJS += dynamic.$(SUFFIX) else COMMONOBJS += parameter.$(SUFFIX) @@ -70,7 +70,7 @@ ifndef BLAS_SERVER BLAS_SERVER = blas_server.c endif -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 66067a05c..f16b827d3 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -435,7 +435,7 @@ static int blas_thread_server(void *arg){ blas_memory_free(buffer); - pthread_exit(NULL); + //pthread_exit(NULL); return 0; } diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index c71e7c276..9cbd7e219 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -63,13 +63,7 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; -void goto_set_num_threads(int num) -{ -} -void openblas_set_num_threads(int num) -{ -} static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ @@ -187,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ do { action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); - } while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); + } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); if (action == WAIT_OBJECT_0 + 1) break; @@ -271,7 +265,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else { legacy_exec(routine, queue -> mode, queue -> args, sb); } - } + }else{ + continue; //if queue == NULL + } #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); @@ -433,7 +429,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ /* Shutdown procedure, but user don't have to call this routine. The */ /* kernel automatically kill threads. */ -int blas_thread_shutdown_(void){ +int BLASFUNC(blas_thread_shutdown)(void){ int i; @@ -445,7 +441,7 @@ int blas_thread_shutdown_(void){ SetEvent(pool.killed); - for(i = 0; i < blas_cpu_number - 1; i++){ + for(i = 0; i < blas_num_threads - 1; i++){ WaitForSingleObject(blas_threads[i], INFINITE); } @@ -456,3 +452,47 @@ int blas_thread_shutdown_(void){ return 0; } + +void goto_set_num_threads(int num_threads) +{ + long i; + + if (num_threads < 1) num_threads = blas_cpu_number; + + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + + if (num_threads > blas_num_threads) { + + LOCK_COMMAND(&server_lock); + + //increased_threads = 1; + if (!blas_server_avail){ + + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + + pool.shutdown = 0; + pool.queue = NULL; + blas_server_avail = 1; + } + + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + + blas_threads[i] = CreateThread(NULL, 0, + blas_thread_server, (void *)i, + 0, &blas_threads_id[i]); + } + + blas_num_threads = num_threads; + + UNLOCK_COMMAND(&server_lock); + } + + blas_cpu_number = num_threads; +} + +void openblas_set_num_threads(int num) +{ + goto_set_num_threads(num); +} \ No newline at end of file diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 0364d0374..aa4b867fd 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -60,6 +60,8 @@ extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_BARCELONA; +extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BOBCAT; #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -122,15 +124,24 @@ static gotoblas_t *get_coretype(void){ if (model == 12) return &gotoblas_ATOM; return NULL; - case 2: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - if (model == 5) return &gotoblas_NEHALEM; + case 2: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + if (model == 5) return &gotoblas_NEHALEM; - //Intel Xeon Processor 5600 (Westmere-EP) - if (model == 12) return &gotoblas_NEHALEM; - return NULL; + //Intel Xeon Processor 5600 (Westmere-EP) + //Xeon Processor E7 (Westmere-EX) + if (model == 12 || model == 15) return &gotoblas_NEHALEM; + + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + //Intel Core i7-3000 / Xeon E5 + if (model == 10 || model == 13) return &gotoblas_SANDYBRIDGE; + return NULL; + case 3: + //Intel Sandy Bridge 22nm (Ivy Bridge?) + if (model == 10) return &gotoblas_SANDYBRIDGE; + return NULL; } case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; @@ -144,7 +155,9 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else { + } else if (exfamily == 5) { + return &gotoblas_BOBCAT; + } else { return &gotoblas_BARCELONA; } } @@ -178,6 +191,8 @@ static char *corename[] = { "Opteron(SSE3)", "Barcelona", "Nano", + "Sandybridge", + "Bobcat", }; char *gotoblas_corename(void) { @@ -197,7 +212,9 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_OPTERON) return corename[13]; if (gotoblas == &gotoblas_BARCELONA) return corename[14]; if (gotoblas == &gotoblas_NANO) return corename[15]; - + if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; + if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + return corename[0]; } diff --git a/driver/others/memory.c b/driver/others/memory.c index af9b54eff..d897fe7e0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -206,7 +206,15 @@ int get_num_procs(void) { #endif +/* +OpenBLAS uses the numbers of CPU cores in multithreading. +It can be set by openblas_set_num_threads(int num_threads); +*/ int blas_cpu_number = 0; +/* +The numbers of threads in the thread pool. +This value is equal or large than blas_cpu_number. This means some threads are sleep. +*/ int blas_num_threads = 0; int goto_get_num_procs (void) { @@ -1289,6 +1297,7 @@ void DESTRUCTOR gotoblas_quit(void) { moncontrol (1); #endif + blas_shutdown(); } #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 27de83ffc..5e24cfcc7 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern void openblas_set_num_threads(int num_threads) ; -void NAME(int* num_threads){ +void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } @@ -46,7 +46,7 @@ void NAME(int* num_threads){ void openblas_set_num_threads(int num_threads) { } -void NAME(int* num_threads){ +void openblas_set_num_threads_(int* num_threads){ } #endif diff --git a/exports/Makefile b/exports/Makefile index 40a3a7c63..c507032e9 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -66,6 +66,11 @@ dll : ../$(LIBDLLNAME) dll2 : libgoto2_shared.dll +# On Windows, we only generate a DLL without a version suffix. This is because +# applications which link against the dynamic library reference a fixed DLL name +# in their import table. By instead using a stable name it is possible to +# upgrade between library versions, without needing to re-link an application. +# For more details see: https://github.com/xianyi/OpenBLAS/issues/127. ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) diff --git a/kernel/Makefile b/kernel/Makefile index aed145b60..41c5e89fd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,7 +48,7 @@ HPLOBJS = \ COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX) endif diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 0db93da92..6f6672099 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -118,7 +118,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, @@ -245,7 +245,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { + printf("helllo\n"); LASWP_NCOPY(min_jj, off + 1, off + k, b + (- off + jjs * lda) * COMPSIZE, lda, diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index b637e6db5..4922b9b52 100644 --- a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -77,10 +77,21 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#if 0 LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sb + k * (jjs - js) * COMPSIZE); +#else + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE); +#endif + for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index a761dee4c..fcea0ae89 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -113,7 +113,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#if 0 +#if 1 LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index c19017631..1b0db5f8c 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -58,13 +58,34 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { @@ -85,10 +106,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + + i--; + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -172,12 +194,69 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a1 -= 2; #endif i --; - } while (i > 0); } + + //Loop Ending + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 1105aee82..8a8a89bd1 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; @@ -60,8 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif @@ -69,6 +68,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if (n <= 0) return 0; j = (n >> 1); + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { do { piv = ipiv; @@ -92,10 +113,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); + i = ((rows) >> 1); - if (i > 0) { - do { + // Loop pipeline + i--; + + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -202,12 +226,99 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -240,78 +351,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index e08d49667..86ee949c4 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -66,14 +66,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 2); if (j > 0) { do { @@ -106,8 +127,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG i = ((k2 - k1) >> 1); - if (i > 0) { - do { + i--; //Loop pipeline + //Main Loop + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -259,12 +281,156 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a7 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -312,10 +478,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = ((rows) >> 1); + i--; + + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -409,12 +575,97 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -445,78 +696,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index a4d4bce99..e3a05dbcc 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -60,9 +60,9 @@ #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, - FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *a9, *a11, *a13, *a15; @@ -79,13 +79,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 3); if (j > 0) { do { @@ -129,50 +151,51 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b15 = b1 + 7 * lda; b16 = b2 + 7 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + i = (rows >> 1); + i--; + //Loop pipeline + //Main Loop + while (i > 0) { + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - B9 = *b9; - B10 = *b10; - B11 = *b11; - B12 = *b12; - B13 = *b13; - B14 = *b14; - B15 = *b15; - B16 = *b16; + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = *a7; - A8 = *a8; + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; - A9 = *a9; - A10 = *a10; - A11 = *a11; - A12 = *a12; - A13 = *a13; - A14 = *a14; - A15 = *a15; - A16 = *a16; + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; if (b1 == a1) { if (b2 == a1) { @@ -371,51 +394,316 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG } } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; - b9 = b1 + 4 * lda; - b10 = b2 + 4 * lda; - b11 = b1 + 5 * lda; - b12 = b2 + 5 * lda; - b13 = b1 + 6 * lda; - b14 = b2 + 6 * lda; - b15 = b1 + 7 * lda; - b16 = b2 + 7 * lda; + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 2; - a9 += 2; - a11 += 2; - a13 += 2; - a15 += 2; + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; #else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; - a9 -= 2; - a11 -= 2; - a13 -= 2; - a15 -= 2; + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + + *a10 = B10; + *b10 = A10; + *a12 = B12; + *b12 = A12; + *a14 = B14; + *b14 = A14; + *a16 = B16; + *b16 = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + + *a9 = A10; + *a10 = B10; + *b10 = A9; + *a11 = A12; + *a12 = B12; + *b12 = A11; + *a13 = A14; + *a14 = B14; + *b14 = A13; + *a15 = A16; + *a16 = B16; + *b16 = A15; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + + *a9 = A10; + *a10 = B9; + *b9 = A9; + *a11 = A12; + *a12 = B11; + *b11 = A11; + *a13 = A14; + *a14 = B13; + *b13 = A13; + *a15 = A16; + *a16 = B15; + *b15 = A15; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 = B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + + *a9 = B9; + *a10 = A9; + *b9 = A10; + *a11 = B11; + *a12 = A11; + *b11 = A12; + *a13 = B13; + *a14 = A13; + *b13 = A14; + *a15 = B15; + *a16 = A15; + *b15 = A16; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + + *a9 = B9; + *a10 = B10; + *b9 = A9; + *b10 = A10; + *a11 = B11; + *a12 = B12; + *b11 = A11; + *b12 = A12; + *a13 = B13; + *a14 = B14; + *b13 = A13; + *b14 = A14; + *a15 = B15; + *a16 = B16; + *b15 = A15; + *b16 = A16; + } + } + + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + b9 = b1 + 4 * lda; + b11 = b1 + 5 * lda; + b13 = b1 + 6 * lda; + b15 = b1 + 7 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -487,35 +775,205 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = *a7; - A8 = *a8; + i = (rows >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + i --; + } + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; @@ -524,150 +982,120 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a6 = A5; *a7 = A8; *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; } else - if (b2 != a2) { + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; *a2 = B2; + *b1 = A1; *b2 = A2; + *a3 = B3; *a4 = B4; + *b3 = A3; *b4 = A4; + *a5 = B5; *a6 = B6; + *b5 = A5; *b6 = A6; + *a7 = B7; *a8 = B8; + *b7 = A7; *b8 = A8; } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - *a3 = A4; - *a4 = A3; - *a5 = A6; - *a6 = A5; - *a7 = A8; - *a8 = A7; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - *a3 = A4; - *a4 = B4; - *b4 = A3; - *a5 = A6; - *a6 = B6; - *b6 = A5; - *a7 = A8; - *a8 = B8; - *b8 = A7; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; - *b1 = A1; - *a3 = A4; - *a4 = B3; - *b3 = A3; - *a5 = A6; - *a6 = B5; - *b5 = A5; - *a7 = A8; - *a8 = B7; - *b7 = A7; - } else - if (b2 == a2) { - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; - *b5 = A5; - *a7 = B7; - *b7 = A7; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - *a5 = B5; - *a6 = A5; - *b5 = A6; - *a7 = B7; - *a8 = A7; - *b7 = A8; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - *a5 = B5; - *a6 = B6; - *b5 = A5; - *b6 = A6; - *a7 = B7; - *a8 = B8; - *b7 = A7; - *b8 = A8; - } - } - - b1 = a + ip1; - b2 = a + ip2; - - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; - -#ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 2; -#else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; -#endif - i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); - - if (i > 0) { - A1 = *a1; - B1 = *b1; - A3 = *a3; - B3 = *b3; - A5 = *a5; - B5 = *b5; - A7 = *a7; - B7 = *b7; - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; - *b5 = A5; - *a7 = B7; - *b7 = A7; - } +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + i = (rows & 1); - a += 4 * lda; + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } + + a += 4 * lda; } if (n & 2) { @@ -692,109 +1120,194 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; + i = ((rows) >> 1); + i--; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } + + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; } else - if (b2 != a2) { + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; *a2 = B2; + *b1 = A1; *b2 = A2; + *a3 = B3; *a4 = B4; + *b3 = A3; *b4 = A4; } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - *a3 = A4; - *a4 = A3; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - *a3 = A4; - *a4 = B4; - *b4 = A3; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; - *b1 = A1; - *a3 = A4; - *a4 = B3; - *b3 = A3; - } else - if (b2 == a2) { - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - } - } - - b1 = a + ip1; - b2 = a + ip2; - - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - + } #ifndef MINUS - a1 += 2; - a3 += 2; + a1 += 2; + a3 += 2; #else - a1 -= 2; - a3 -= 2; + a1 -= 2; + a3 -= 2; #endif - i --; - } while (i > 0); - } - - i = ((k2 - k1) & 1); + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -825,78 +1338,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index 3dd653baf..7a62dd9b8 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -66,6 +66,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { @@ -87,9 +119,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b2 = a + ip2; i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i --; + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -198,12 +231,98 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); + } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index a877ef66b..0fa685859 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -68,6 +68,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 1); if (j > 0) { @@ -88,10 +120,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -246,12 +280,149 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a1 + 0 + lda); + A6 = *(a1 + 1 + lda); + A7 = *(a2 + 0 + lda); + A8 = *(a2 + 1 + lda); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b1 + 0 + lda); + B6 = *(b1 + 1 + lda); + B7 = *(b2 + 0 + lda); + B8 = *(b2 + 1 + lda); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A5; + *(b2 + 1 + lda) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B5; + *(a2 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + *(b1 + 0 + lda) = A7; + *(b1 + 1 + lda) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } + + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a1 + 0 + lda); @@ -293,10 +464,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -384,12 +557,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 4dc559895..c63a8e2e0 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -76,6 +76,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 2); if (j > 0) { @@ -107,10 +139,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -366,12 +400,260 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a7 -= 4; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + A9 = *(a5 + 0); + A10 = *(a5 + 1); + A11 = *(a6 + 0); + A12 = *(a6 + 1); + A13 = *(a7 + 0); + A14 = *(a7 + 1); + A15 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + B9 = *(b5 + 0); + B10 = *(b5 + 1); + B11 = *(b6 + 0); + B12 = *(b6 + 1); + B13 = *(b7 + 0); + B14 = *(b7 + 1); + B15 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A9; + *(b6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A13; + *(b8 + 1) = A14; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B9; + *(a6 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B13; + *(a8 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(b5 + 0) = A11; + *(b5 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + *(b7 + 0) = A15; + *(b7 + 1) = A16; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } + +#ifndef MINUS + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; +#else + a1 -= 4; + a3 -= 4; + a5 -= 4; + a7 -= 4; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -435,37 +717,205 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b3 = b1 + lda; b4 = b2 + lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *(a1 + 0); - A2 = *(a1 + 1); - A3 = *(a2 + 0); - A4 = *(a2 + 1); - - A5 = *(a3 + 0); - A6 = *(a3 + 1); - A7 = *(a4 + 0); - A8 = *(a4 + 1); + i = (rows >> 1); + i--; - B1 = *(b1 + 0); - B2 = *(b1 + 1); - B3 = *(b2 + 0); - B4 = *(b2 + 1); + //Loop pipeline + //Main Loop + while (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); - B5 = *(b3 + 0); - B6 = *(b3 + 1); - B7 = *(b4 + 0); - B8 = *(b4 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); - ip1 = *piv * 2; - piv += incx; - ip2 = *piv * 2; - piv += incx; + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); - if (b1 == a1) { + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + +#ifndef MINUS + a1 += 4; + a3 += 4; +#else + a1 -= 4; + a3 -= 4; +#endif + i --; + } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; @@ -474,122 +924,96 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; } else - if (b2 != a2) { + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; *(a4 + 0) = B7; *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - } else { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b2 + 0) = A1; - *(b2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b4 + 0) = A5; - *(b4 + 1) = A6; - } - } - } else { - if (b2 == a1) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B1; - *(a2 + 1) = B2; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B5; - *(a4 + 1) = B6; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - } else - if (b2 == a2) { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - } else - if (b2 == b1) { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(b1 + 0) = A3; - *(b1 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - *(b3 + 0) = A7; - *(b3 + 1) = A8; - } else { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(b2 + 0) = A3; - *(b2 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - *(b4 + 0) = A7; - *(b4 + 1) = A8; - } - } - - b1 = a + ip1; - b2 = a + ip2; - - b3 = b1 + lda; - b4 = b2 + lda; + } #ifndef MINUS - a1 += 4; - a3 += 4; + a1 += 4; + a3 += 4; #else - a1 -= 4; - a3 -= 4; + a1 -= 4; + a3 -= 4; #endif - i --; - } while (i > 0); - } - - i = ((k2 - k1) & 1); + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + + b1 = a + ip1; + b3 = b1 + lda; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -629,10 +1053,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -720,12 +1146,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/x86/Makefile b/lapack/laswp/x86/Makefile index 105ec4027..434c82a84 100644 --- a/lapack/laswp/x86/Makefile +++ b/lapack/laswp/x86/Makefile @@ -16,12 +16,17 @@ LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = ../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/lapack/laswp/x86_64/Makefile b/lapack/laswp/x86_64/Makefile index ba07dcf4f..e6dae5344 100644 --- a/lapack/laswp/x86_64/Makefile +++ b/lapack/laswp/x86_64/Makefile @@ -21,12 +21,17 @@ LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = ../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/patch.for_lapack-3.4.1 b/patch.for_lapack-3.4.1 index 79c74aad2..ff4954b09 100644 --- a/patch.for_lapack-3.4.1 +++ b/patch.for_lapack-3.4.1 @@ -191,7 +191,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ + slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ + slarrv.$(SUFFIX) slartv.$(SUFFIX) \ -+ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slaswp.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ + slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ + sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ + sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ @@ -345,7 +345,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ + clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ + clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ -+ claswp.$(SUFFIX) clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ + clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ + cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ + cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ @@ -484,7 +484,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ + dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ + dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ -+ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlaswp.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ + dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ + dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ + dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ @@ -643,7 +643,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ + zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ + zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ -+ zlassq.$(SUFFIX) zlaswp.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ + zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ + zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ + zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ diff --git a/quickbuild.win32 b/quickbuild.win32 index 29949c192..3d7db1770 100644 --- a/quickbuild.win32 +++ b/quickbuild.win32 @@ -1,3 +1,4 @@ #!/bin/bash +echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio " make BINARY=32 CC=gcc FC=gfortran diff --git a/quickbuild.win64 b/quickbuild.win64 index 88f748a8d..8f0189435 100644 --- a/quickbuild.win64 +++ b/quickbuild.win64 @@ -1,3 +1,4 @@ #!/bin/bash +echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio " make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran