diff --git a/Makefile.x86 b/Makefile.x86
index 0e27264d8..893379c33 100644
--- a/Makefile.x86
+++ b/Makefile.x86
@@ -1,10 +1,21 @@
 # COMPILER_PREFIX = mingw32-
-ifdef HAVE_SSE
-CCOMMON_OPT += -msse
-FCOMMON_OPT += -msse
+ifndef DYNAMIC_ARCH
+ADD_CPUFLAGS = 1
+else
+ifdef TARGET_CORE
+ADD_CPUFLAGS = 1
+endif
 endif
+ifdef ADD_CPUFLAGS
+ifdef HAVE_SSE
+CCOMMON_OPT += -msse
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -msse
+endif
+endif
+endif
 ifeq ($(OSNAME), Interix)
 ARFLAGS = -m x86
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 5406494c9..f62ab9e5e 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -8,6 +8,16 @@ endif
 endif
 endif
+
+ifndef DYNAMIC_ARCH
+ADD_CPUFLAGS = 1
+else
+ifdef TARGET_CORE
+ADD_CPUFLAGS = 1
+endif
+endif
+
+ifdef ADD_CPUFLAGS
 ifdef HAVE_SSE3
 CCOMMON_OPT += -msse3
 ifneq ($(F_COMPILER), NAG)
@@ -44,7 +54,6 @@ endif
 endif
 ifeq ($(CORE), SKYLAKEX)
-ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 CCOMMON_OPT += -march=skylake-avx512
 ifneq ($(F_COMPILER), NAG)
@@ -62,10 +71,8 @@ endif
 endif
 endif
 endif
-endif
 ifeq ($(CORE), COOPERLAKE)
-ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
 # cooperlake support was added in 10.1
@@ -88,7 +95,6 @@ endif
 endif
 endif
 endif
-endif
 ifdef HAVE_AVX2
 ifndef NO_AVX2
@@ -120,6 +126,7 @@ endif
 endif
 endif
+endif
 ifeq ($(OSNAME), Interix)
diff --git a/cmake/system.cmake b/cmake/system.cmake
index eee429113..d6c71b774 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -299,6 +299,10 @@ if (NO_AVX2)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
 endif ()
+if (NO_AVX512)
+  set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
+endif ()
+
 if (USE_THREAD)
 # USE_SIMPLE_THREADED_LEVEL3 = 1
 # NO_AFFINITY = 1
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 6c68ba98a..0b623c3ac 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -126,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg);
 #endif
 #define get_cpu_ftr(id, var) ({ \
-  __asm__ ("mrs %0, "#id : "=r" (var)); \
+  __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
 })
 static char *corename[] = {
diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 594b1a35a..873653f1e 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -186,7 +186,7 @@ ZSWAPKERNEL = zswap.c
 SGEMVNKERNEL = sgemv_n.c
 DGEMVNKERNEL = dgemv_n_power10.c
 CGEMVNKERNEL = cgemv_n.c
-ZGEMVNKERNEL = zgemv_n_4.c
+ZGEMVNKERNEL = zgemv_n_power10.c
 #
 SGEMVTKERNEL = sgemv_t.c
 DGEMVTKERNEL = dgemv_t_power10.c
diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c
index e918e61c3..cdd846891 100644
--- a/kernel/power/dgemm_kernel_power10.c
+++ b/kernel/power/dgemm_kernel_power10.c
@@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
       BLASLONG l = 0;
       vec_t *rowA = (vec_t *) & AO[0];
-      vec_t *rb = (vec_t *) & BO[0];
       __vector_pair rowB, rowB1;
-      __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
-      __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
+      rowB = *((__vector_pair *)((void *)&BO[0]));
+      rowB1 = *((__vector_pair *)((void *)&BO[4]));
       __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
       __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
       __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       for (l = 1; l < temp; l++)
 	{
 	  rowA = (vec_t *) & AO[l << 3];
-	  rb = (vec_t *) & BO[l << 3];
-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -247,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -257,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -403,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -412,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair 
*)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c new file mode 100644 index 000000000..f5bb8d70e --- /dev/null +++ b/kernel/power/zgemv_n_power10.c @@ -0,0 +1,1102 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif +#endif + +// +#define NBMAX 4096 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] - result[1][1]; \ + result[0][1] = result[0][1] + result[1][0]; \ + result[1][0] = result[2][0] - result[3][1]; \ + result[1][1] = result[2][1] + result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#else +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] + result[1][1]; \ + result[0][1] = result[0][1] - result[1][0]; \ + result[1][0] = result[2][0] + result[3][1]; \ + result[1][1] = result[2][1] - result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#endif + +static void zgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector_quad acc0, acc1, acc2, acc3; + v4sf_t result[4]; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + + register __vector double vx0_r = {x[0], x[1]}; + register __vector double vx1_r = {x[2], x[3]}; + register __vector double vx2_r = {x[4], x[5]}; + register __vector double vx3_r = {x[6], x[7]}; + register __vector double vx4_r = {x[8], x[9]}; + register __vector double vx5_r = {x[10], x[11]}; + register __vector double vx6_r = {x[12], x[13]}; + register __vector double vx7_r = {x[14], x[15]}; + __vector_pair *Va0, *Va1, *Va2, *Va3; + __vector_pair *Va4, *Va5, *Va6, *Va7; + BLASLONG i = 0, i2 = 0; + v4sf_t *rowC; + BLASLONG tmp = (n / 8) * 8; + for (i = 0; i < tmp; i += 8) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + 
__builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + __builtin_mma_xvf64ger (&acc2, Va0[2], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc3, Va0[3], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc2, Va1[2], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc3, Va1[3], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc2, Va2[2], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc3, Va2[3], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc2, Va3[2], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc3, Va3[3], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc2, Va4[2], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc3, Va4[3], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc2, Va5[2], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc3, Va5[3], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc2, Va6[2], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc3, Va6[3], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc2, Va7[2], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc3, Va7[3], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + SAVE_RESULT(&acc2, 8); + SAVE_RESULT(&acc3, 12); + } + while (i < n) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + i += 4; + } +} +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; 
+ register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; +#endif + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + + register __vector double vy_0; + register __vector double va0; + register __vector double va1; + register __vector double va2; + register __vector double va3; + register __vector double vy_1; + register __vector double va0_1; + register __vector double va1_1; + register __vector double va2_1; + register __vector double va3_1; + register __vector double vy_2; + register __vector double va0_2; + register __vector double va1_2; + register __vector double va2_2; + register __vector double va3_2; + register __vector double vy_3; + register __vector double va0_3; + register __vector double va1_3; + register __vector double va2_3; + register __vector double va3_3; + + BLASLONG i = 0; + while (i < n) { + + vy_0 = vy[i]; + va0 = vptr_a0[i]; + va1 = vptr_a1[i]; + va2 = vptr_a2[i]; + va3 = vptr_a3[i]; + + vy_1 = vy[i + 1]; + va0_1 = vptr_a0[i + 1]; + va1_1 = vptr_a1[i + 1]; + va2_1 = vptr_a2[i + 1]; + va3_1 = vptr_a3[i + 1]; + + vy_2 = vy[i + 2]; + va0_2 = vptr_a0[i + 2]; + va1_2 = vptr_a1[i + 2]; + va2_2 = vptr_a2[i + 2]; + va3_2 = vptr_a3[i + 2]; + + vy_3 = vy[i + 3]; + va0_3 = vptr_a0[i + 3]; + va1_3 = vptr_a1[i + 3]; + va2_3 = vptr_a2[i + 3]; + va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_xxpermdi(va2, va2, 2); + va2_1 = vec_xxpermdi(va2_1, va2_1, 2); + + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + + va2_2 = vec_xxpermdi(va2_2, va2_2, 2); + va2_3 = vec_xxpermdi(va2_3, va2_3, 2); + + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_xxpermdi(va3, va3, 2); + va3_1 = vec_xxpermdi(va3_1, va3_1, 2); + + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + + va3_2 = vec_xxpermdi(va3_2, va3_2, 2); + va3_3 = vec_xxpermdi(va3_3, va3_3, 2); + + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + + i += 4; + + + } + +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * 
x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +#endif + } +} + +#endif + 
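All of the vectorized kernels in zgemv_n_power10.c handle the complex update y += a*x the same way: the real and imaginary parts of x are pre-broadcast into two 2-lane vectors with the conjugation sign folded in, and the matrix entry a is lane-swapped once (vec_xxpermdi) so that two fused multiply-adds yield the full complex product. Below is a plain-C model of one such update (editorial sketch, not part of the patch; the helper name is illustrative and only the non-conjugated case is shown):

/* Scalar model of the split real/imaginary broadcast plus lane-swap scheme. */
#include <stdio.h>

static void cmadd_split(const double a[2], double xr, double xi, double y[2])
{
  /* broadcast vectors, as the kernel builds them once per column of A */
  double vx_r[2] = { xr, xr };
  double vx_i[2] = { -xi, xi };
  /* lane swap of a = {ar, ai}: scalar analogue of vec_xxpermdi(va, va, 2) */
  double a_swap[2] = { a[1], a[0] };

  y[0] += a[0] * vx_r[0] + a_swap[0] * vx_i[0];   /* ar*xr - ai*xi */
  y[1] += a[1] * vx_r[1] + a_swap[1] * vx_i[1];   /* ai*xr + ar*xi */
}

int main(void)
{
  double a[2] = { 1.0, 2.0 };      /* a = 1 + 2i */
  double y[2] = { 0.0, 0.0 };
  cmadd_split(a, 3.0, 4.0, y);     /* x = 3 + 4i */
  printf("%g %g\n", y[0], y[1]);   /* prints -5 10, i.e. (1+2i)*(3+4i) */
  return 0;
}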
+#ifdef HAVE_KERNEL_4x1_VEC + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va0x = vec_xxpermdi(va0, va0, 2); + register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); + register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); + register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + vy_2 += va0_2*vx0_r + va0x_2*vx0_i; + vy_3 += va0_3*vx0_r + va0x_3*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; +#endif + + } +} + +#endif + +#ifdef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + +#if !defined(XCONJ) + + register __vector double valpha_r = {alpha_r, alpha_r}; + register __vector double valpha_i = {-alpha_i, alpha_i}; + +#else + register __vector double valpha_r = {alpha_r, -alpha_r}; + register __vector double valpha_i = {alpha_i, alpha_i}; +#endif + + register __vector double *vptr_src = (__vector double *) src; + if (inc_dest != 2) { + register __vector double *vptr_y = (__vector double *) dest; + //note that inc_dest is already 2x. 
so we should add it to double* + register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); + register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); + register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); + BLASLONG dest_t = 0; + BLASLONG add_dest = inc_dest << 1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[dest_t]; + register __vector double vy_1 = vptr_y1[dest_t]; + register __vector double vy_2 = vptr_y2[dest_t]; + register __vector double vy_3 = vptr_y3[dest_t]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[dest_t] = vy_0; + vptr_y1[dest_t ] = vy_1; + vptr_y2[dest_t] = vy_2; + vptr_y3[dest_t] = vy_3; + + dest_t += add_dest; + + } + + return; + } else { + register __vector double *vptr_y = (__vector double *) dest; + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[i]; + register __vector double vy_1 = vptr_y[i + 1]; + register __vector double vy_2 = vptr_y[i + 2]; + register __vector double vy_3 = vptr_y[i + 3]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[i] = vy_0; + vptr_y[i + 1 ] = vy_1; + vptr_y[i + 2] = vy_2; + vptr_y[i + 3] = vy_3; + + } + + return; + } + return; +} + +#else + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + if (inc_dest != 2) { + + FLOAT temp_r; + FLOAT temp_i; + for (i = 0; i < n; i++) { +#if !defined(XCONJ) + temp_r = alpha_r * src[0] - alpha_i * src[1]; + temp_i = alpha_r * src[1] + alpha_i * src[0]; +#else + temp_r = alpha_r * src[0] + alpha_i * src[1]; + temp_i = -alpha_r * src[1] + alpha_i * src[0]; +#endif + + *dest += temp_r; + *(dest + 1) += temp_i; + + src += 2; + dest += inc_dest; + } + return; + } + + FLOAT temp_r0; + FLOAT temp_i0; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT temp_r2; + FLOAT temp_i2; + FLOAT temp_r3; + FLOAT temp_i3; + for (i = 0; i < n; i += 4) { +#if !defined(XCONJ) + temp_r0 = alpha_r * src[0] - alpha_i * src[1]; + temp_i0 = alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] - alpha_i * src[3]; + temp_i1 = alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] - alpha_i * src[5]; + temp_i2 = alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] - alpha_i * src[7]; + temp_i3 = alpha_r * src[7] + alpha_i * src[6]; +#else + 
temp_r0 = alpha_r * src[0] + alpha_i * src[1]; + temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] + alpha_i * src[3]; + temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] + alpha_i * src[5]; + temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] + alpha_i * src[7]; + temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; +#endif + + dest[0] += temp_r0; + dest[1] += temp_i0; + dest[2] += temp_r1; + dest[3] += temp_i1; + dest[4] += temp_r2; + dest[5] += temp_i2; + dest[6] += temp_r3; + dest[7] += temp_i3; + + src += 8; + dest += 8; + } + return; +} +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT xbuffer[16] __attribute__((aligned(16))); + FLOAT *ybuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + ybuffer = buffer; + + inc_x *= 2; + inc_y *= 2; + lda *= 2; + + n1 = n / 8; + n2 = n % 8; + + m3 = m % 4; + m1 = m - (m % 4); + m2 = (m % NBMAX) - (m % 4); + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + + x_ptr = x; + //zero_y(NB,ybuffer); + memset(ybuffer, 0, NB * 16); + + if (inc_x == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x8(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 3; + x_ptr += 16; + } + if (n2 & 4) { + zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 2; + x_ptr += 8; + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); + x_ptr += 4; + a_ptr += 2 * lda; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); + x_ptr += 2; + a_ptr += lda; + + } + } else { + + for (i = 0; i < n1; i++) { + + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + xbuffer[3] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + xbuffer[5] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + xbuffer[7] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[8] = x_ptr[0]; + xbuffer[9] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[10] = x_ptr[0]; + xbuffer[11] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[12] = x_ptr[0]; + xbuffer[13] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[14] = x_ptr[0]; + xbuffer[15] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer); + + a_ptr += lda << 3; + } + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + a_ptr += lda; + + } + + } + + add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); + a += 2 * NB; + y_ptr += NB * inc_y; + } + + if (m3 == 0) return (0); + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + if (lda == 2 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; + temp_i += 
a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; +#endif + + a_ptr += 4; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += 2; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + + if (lda == 4 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; +#endif + + a_ptr += 8; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + a_ptr += 4; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; 
+#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + return (0); + } + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_i2 = 0.0; + + if (lda == 6 && inc_x == 2) { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += 6; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; + y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; + y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; +#endif + return (0); + } + + return (0); +} + diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 956d75ffc..d3bf60ca7 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
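The MMA-based kernels on both sides of this hunk (the SAVE_RESULT macro in zgemv_n_power10.c above and the POWER10 zgemv_kernel_4x4 added below) recover a complex product from an accumulator in the same way: xvf64ger/xvf64gerpp accumulate a plain outer product of (a_r, a_i) with (x_r, x_i), and the real and imaginary parts are then the diagonal difference and anti-diagonal sum of that 2x2 block, e.g. result[0][0] - result[1][1] and result[0][1] + result[1][0]. A scalar model of the combine step (editorial sketch, not part of the patch, non-conjugated case):

#include <stdio.h>

int main(void)
{
  double ar = 1.0, ai = 2.0;   /* a = 1 + 2i */
  double xr = 3.0, xi = 4.0;   /* x = 3 + 4i */

  /* 2x2 outer product, as accumulated by __builtin_mma_xvf64gerpp */
  double acc[2][2] = { { ar * xr, ar * xi },
                       { ai * xr, ai * xi } };

  /* combine, as in "result[0][0] - result[1][1]" etc. in the kernels */
  double re = acc[0][0] - acc[1][1];   /* ar*xr - ai*xi */
  double im = acc[0][1] + acc[1][0];   /* ar*xi + ai*xr */

  printf("%g %g\n", re, im);           /* prints -5 10 */
  return 0;
}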
#elif HAVE_KERNEL_4x4_VEC +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector_quad acc0, acc1, acc2, acc3;; + __vector_quad acc4, acc5, acc6, acc7; + v4sf_t result[4]; + __vector_pair *Va0, *Va1, *Va2, *Va3; + i = 0; + n = n << 1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + __builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + while (i < n) { + + vec_t *rx = (vec_t *) & x[i]; + Va0 = ((__vector_pair*)((void*)&a0[i])); + Va1 = ((__vector_pair*)((void*)&a1[i])); + Va2 = ((__vector_pair*)((void*)&a2[i])); + Va3 = ((__vector_pair*)((void*)&a3[i])); + + __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]); + i += 8; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + __builtin_mma_disassemble_acc ((void *)result, &acc0); + register FLOAT temp_r0 = result[0][0] - result[1][1]; + register FLOAT temp_i0 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] - result[3][1]; + temp_i0 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] - result[1][1]; + register FLOAT temp_i1 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] - result[3][1]; + temp_i1 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] - result[1][1]; + register FLOAT temp_i2 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] - result[3][1]; + temp_i2 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] - result[1][1]; + register FLOAT temp_i3 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] - result[3][1]; + temp_i3 += result[2][1] + result[3][0]; +#else + __builtin_mma_disassemble_acc ((void *)result, &acc0); + register FLOAT temp_r0 = result[0][0] + result[1][1]; + register FLOAT temp_i0 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] + result[3][1]; + temp_i0 += result[2][1] - 
result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] + result[1][1]; + register FLOAT temp_i1 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] + result[3][1]; + temp_i1 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] + result[1][1]; + register FLOAT temp_i2 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] + result[3][1]; + temp_i2 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] + result[1][1]; + register FLOAT temp_i3 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] + result[3][1]; + temp_i3 += result[2][1] - result[3][0]; +#endif +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; +#endif +} +#else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; FLOAT *a0, *a1, *a2, *a3; @@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA #endif } +#endif #else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index f3d614242..2db8b2fea 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}; BLASLONG n_count = n; float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; +#if defined(__clang__) + for(;n_count>23;n_count-=24) COMPUTE(24) +#else for(;n_count>23;n_count-=24) COMPUTE_n24 +#endif for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) for(;n_count>11;n_count-=12) COMPUTE(12) diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 4725e7169..bcf5acd0b 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -319,14 +319,14 @@ REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, $ U12, X, ABI12, Y * .. * .. External Functions .. COMPLEX CLADIV LOGICAL LSAME REAL CLANHS, SLAMCH - EXTERNAL CLADIV, LLSAME, CLANHS, SLAMCH + EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH * .. * .. 
External Subroutines ..
      EXTERNAL           CLARTG, CLASET, CROT, CSCAL, XERBLA
@@ -351,6 +351,7 @@
          ILSCHR = .TRUE.
          ISCHUR = 2
       ELSE
+         ILSCHR = .TRUE.
          ISCHUR = 0
       END IF
 *
@@ -364,6 +365,7 @@
          ILQ = .TRUE.
          ICOMPQ = 3
       ELSE
+         ILQ = .TRUE.
          ICOMPQ = 0
       END IF
 *
@@ -377,6 +379,7 @@
          ILZ = .TRUE.
          ICOMPZ = 3
       ELSE
+         ILZ = .TRUE.
          ICOMPZ = 0
       END IF
 *
diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f
index 61b016f16..1c277c6bb 100644
--- a/lapack-netlib/SRC/dlanv2.f
+++ b/lapack-netlib/SRC/dlanv2.f
@@ -139,7 +139,7 @@
 * =====================================================================
 *
 *     .. Parameters ..
-      DOUBLE PRECISION   ZERO, HALF, ONE
+      DOUBLE PRECISION   ZERO, HALF, ONE, TWO
       PARAMETER          ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0,
      $                     TWO = 2.0D0 )
       DOUBLE PRECISION   MULTPL
diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f
index e678305f2..375645b75 100644
--- a/lapack-netlib/SRC/slanv2.f
+++ b/lapack-netlib/SRC/slanv2.f
@@ -139,7 +139,7 @@
 * =====================================================================
 *
 *     .. Parameters ..
-      REAL               ZERO, HALF, ONE
+      REAL               ZERO, HALF, ONE, TWO
       PARAMETER          ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0,
      $                     TWO = 2.0E+0 )
       REAL               MULTPL
diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f
index b28ae47a4..960244727 100644
--- a/lapack-netlib/SRC/zhgeqz.f
+++ b/lapack-netlib/SRC/zhgeqz.f
@@ -319,7 +319,7 @@
       DOUBLE PRECISION   ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL,
      $                   C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
       COMPLEX*16         ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
-     $                   CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
+     $                   CTEMP3, ESHIFT, S, SHIFT, SIGNBC,
      $                   U12, X, ABI12, Y
 *     ..
 *     .. External Functions ..
@@ -352,6 +352,7 @@
          ILSCHR = .TRUE.
          ISCHUR = 2
       ELSE
+         ILSCHR = .TRUE.
          ISCHUR = 0
       END IF
 *
@@ -365,6 +366,7 @@
          ILQ = .TRUE.
          ICOMPQ = 3
       ELSE
+         ILQ = .TRUE.
          ICOMPQ = 0
       END IF
 *
@@ -378,6 +380,7 @@
          ILZ = .TRUE.
          ICOMPZ = 3
       ELSE
+         ILZ = .TRUE.
          ICOMPZ = 0
       END IF
 *
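The chgeqz.f/zhgeqz.f hunks add ILSCHR/ILQ/ILZ assignments to the final ELSE branches, which correspond to an invalid JOB/COMPQ/COMPZ argument (ISCHUR/ICOMPQ/ICOMPZ = 0); this presumably guarantees the logical flags hold a defined value even when the error path is taken. A minimal C analogue of the pattern (editorial sketch, not part of the patch; names are illustrative):

#include <stdbool.h>
#include <stdio.h>

static int decode_job(char job, bool *wants_schur)
{
  int ischur;
  if (job == 'E') {
    *wants_schur = false;  ischur = 1;
  } else if (job == 'S') {
    *wants_schur = true;   ischur = 2;
  } else {
    *wants_schur = true;   /* defined even on the error path */
    ischur = 0;            /* caller flags the argument as invalid */
  }
  return ischur;
}

int main(void)
{
  bool wants_schur;
  int ischur = decode_job('X', &wants_schur);          /* invalid mode */
  printf("%d %d\n", ischur, (int) wants_schur);        /* prints 0 1 */
  return 0;
}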