diff --git a/Makefile.install b/Makefile.install index 01c0b1226..7c1a3ca43 100644 --- a/Makefile.install +++ b/Makefile.install @@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig +PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(USE_OPENMP), 1) + ifeq ($(C_COMPILER), PGI) + PKG_EXTRALIB += -lomp + else + PKG_EXTRALIB += -lgomp + endif +endif .PHONY : install .NOTPARALLEL : install @@ -147,7 +155,7 @@ endif @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" diff --git a/Makefile.system b/Makefile.system index 2286d14f2..e7d3dc4ce 100644 --- a/Makefile.system +++ b/Makefile.system @@ -25,6 +25,10 @@ else ifeq ($(ARCH), powerpc) override ARCH=power else ifeq ($(ARCH), i386) override ARCH=x86 +else ifeq ($(ARCH), armv6) +override ARCH=arm +else ifeq ($(ARCH), armv7) +override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 else ifeq ($(ARCH), zarch) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index df4b2ab06..0bd49f996 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -7,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas${libsuffix} +Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 4382ffc4e..511a7c7d1 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -54,14 +54,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") - set(ARM 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(ARM64 1) else() set(ARM 1) endif() +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(ARM 1) elseif (${CMAKE_CROSSCOMPILING}) if (${TARGET} STREQUAL "CORE2") if (NOT BINARY) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 6f41be604..1fd43148a 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -197,6 +197,8 @@ int detect(void) } +#else + return CPU_ARMV8; #endif return CPU_UNKNOWN; diff --git a/cpuid_power.c b/cpuid_power.c index df3dc8668..b17493bc8 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -145,7 +145,7 @@ int detect(void){ if (implementation >= 0x40000u) return CPUTYPE_POWER10; else if (implementation & 0x20000) return CPUTYPE_POWER9; else if (implementation & 0x10000) return CPUTYPE_POWER8; - else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7 + else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7 else if (implementation & 0x04000) return CPUTYPE_POWER6; else if (implementation & 0x02000) return CPUTYPE_POWER5; else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index b4eb27c25..d9969b599 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(OMP_SCHED) +#pragma omp parallel for num_threads(num) schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index a0bc1a777..b2a29140e 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -87,22 +87,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); rowC[0] += result[1] * alpha; #endif -#define SET_ACC_ZERO4() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); - -#define SET_ACC_ZERO8() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); - #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -210,12 +194,22 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, PREFETCH1 (CO + ldc + ldc, 128); PREFETCH1 (CO + ldc + ldc + ldc, 128); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -254,13 +248,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 3]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -291,14 +291,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -325,13 +328,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 1]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } @@ -414,16 +420,27 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 4]; + rowA = (vec_t *) & AO[l << 4]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -461,16 +478,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 3]; + rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -500,17 +524,21 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -536,16 +564,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 1]; + rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE2x4_ACC (&acc0, 0); diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 81a5ec76b..9fbf84695 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -134,21 +134,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); -#define SET_ACC_ZERO4() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); - -#define SET_ACC_ZERO8() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); @@ -249,8 +234,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; + vec_t *rowA1 = (vec_t *) & AO[0]; + vec_t *rowB1 = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]); + __builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]); + __builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]); + __builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]); + __builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]); + AO += 16; + BO += 8; + temp--; BLASLONG K = temp / 64; for (l = 0; l < K; l++) { @@ -454,12 +451,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); @@ -489,13 +491,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); } @@ -522,15 +526,18 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v2sf_t *rowC; v2sf_t result[8]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = AO[0], t[1] = AO[1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; - vec_t *rowA = (vec_t *) & t[0]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & t[0]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); } @@ -625,13 +632,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, FLOAT *A1; A1 = AO + (16 * k); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < k; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowA1 = (vec_t *) & A1[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); + for (l = 1; l < k; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowA1 = (vec_t *) & A1[l << 4]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rowA1 = (vec_t *) & A1[l << 4]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -673,12 +690,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -710,13 +732,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); } @@ -742,12 +766,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; __vector_quad acc0; v4sf_t result[4]; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE_ACC (&acc0, 0); @@ -771,14 +797,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v2sf_t *rowC; v2sf_t result[8]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = AO[0], t[1] = AO[1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; - vec_t *rowA = (vec_t *) & t[0]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & t[0]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE4x2_ACC (&acc0, 0); @@ -856,15 +885,26 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, FLOAT *A1; A1 = AO + (16 * k); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < k; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowA1 = (vec_t *) & A1[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); + for (l = 1; l < k; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowA1 = (vec_t *) & A1[l << 4]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 4]; + rowA1 = (vec_t *) & A1[l << 4]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -897,7 +937,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; #if defined(TRMMKERNEL) REFRESH_POINTERS (16, 2) @@ -905,12 +944,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BO = B; temp = k; #endif - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 4]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 4]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -934,8 +980,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); #if defined(TRMMKERNEL) REFRESH_POINTERS (8, 2) #else @@ -943,12 +987,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, temp = k; #endif BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); } @@ -968,7 +1017,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); #if defined(TRMMKERNEL) REFRESH_POINTERS (4, 2) #else @@ -976,12 +1024,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, temp = k; #endif BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE2x4_ACC (&acc0, 0); diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 90fd86daf..1bc785ac1 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -168,7 +168,7 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO #if defined(SMP) static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, -BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG dummy1, FLOAT dummy2r, FLOAT dummy2i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { zdot_compute(n, x, inc_x, y, inc_y, (void *)result); diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 36e53ec24..4f48b7c87 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3650,45 +3650,45 @@ void LAPACK_zggrqf( lapack_int* info ); #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) -lapack_int LAPACKE_sggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, float* a, - lapack_int lda, float* b, lapack_int ldb, - float* alpha, float* beta, float* u, lapack_int ldu, - float* v, lapack_int ldv, float* q, lapack_int ldq, - lapack_int* iwork ); + lapack_int* lda, float* b, lapack_int* ldb, + float* alpha, float* beta, float* u, lapack_int* ldu, + float* v, lapack_int* ldv, float* q, lapack_int* ldq, + float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) -lapack_int LAPACKE_dggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, double* a, - lapack_int lda, double* b, lapack_int ldb, + lapack_int* lda, double* b, lapack_int* ldb, double* alpha, double* beta, double* u, - lapack_int ldu, double* v, lapack_int ldv, double* q, - lapack_int ldq, lapack_int* iwork ); + lapack_int* ldu, double* v, lapack_int* ldv, double* q, + lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) -lapack_int LAPACKE_cggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int lda, - lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* a, lapack_int* lda, + lapack_complex_float* b, lapack_int* ldb, float* alpha, float* beta, lapack_complex_float* u, - lapack_int ldu, lapack_complex_float* v, - lapack_int ldv, lapack_complex_float* q, - lapack_int ldq, lapack_int* iwork ); + lapack_int* ldu, lapack_complex_float* v, + lapack_int* ldv, lapack_complex_float* q, + lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) -lapack_int LAPACKE_zggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int lda, - lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* a, lapack_int* lda, + lapack_complex_double* b, lapack_int* ldb, double* alpha, double* beta, - lapack_complex_double* u, lapack_int ldu, - lapack_complex_double* v, lapack_int ldv, - lapack_complex_double* q, lapack_int ldq, - lapack_int* iwork ); + lapack_complex_double* u, lapack_int* ldu, + lapack_complex_double* v, lapack_int* ldv, + lapack_complex_double* q, lapack_int* ldq, + float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void LAPACK_cggsvd3( @@ -3753,41 +3753,49 @@ void LAPACK_zggsvd3( lapack_int* info ); #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) -lapack_int LAPACKE_sggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, float* a, - lapack_int lda, float* b, lapack_int ldb, float tola, - float tolb, lapack_int* k, lapack_int* l, float* u, - lapack_int ldu, float* v, lapack_int ldv, float* q, - lapack_int ldq ); +lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, float* a, + lapack_int* lda, float* b, lapack_int* ldb, float* tola, + float* tolb, lapack_int* k, lapack_int* l, float* u, + lapack_int* ldu, float* v, lapack_int* ldv, float* q, + lapack_int* ldq, lapack_int* iwork, float* tau, + float* work, lapack_int* info); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) -lapack_int LAPACKE_dggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, double* a, - lapack_int lda, double* b, lapack_int ldb, - double tola, double tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int ldu, double* v, - lapack_int ldv, double* q, lapack_int ldq ); +lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, double* a, + lapack_int* lda, double* b, lapack_int* ldb, + double* tola, double* tolb, lapack_int* k, + lapack_int* l, double* u, lapack_int* ldu, double* v, + lapack_int* ldv, double* q, lapack_int* ldq, + lapack_int* iwork, double* tau, double* work, + lapack_int* info); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) -lapack_int LAPACKE_cggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, - lapack_complex_float* a, lapack_int lda, - lapack_complex_float* b, lapack_int ldb, float tola, - float tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int ldu, - lapack_complex_float* v, lapack_int ldv, - lapack_complex_float* q, lapack_int ldq ); +lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, + lapack_complex_float* a, lapack_int* lda, + lapack_complex_float* b, lapack_int* ldb, float* tola, + float* tolb, lapack_int* k, lapack_int* l, + lapack_complex_float* u, lapack_int* ldu, + lapack_complex_float* v, lapack_int* ldv, + lapack_complex_float* q, lapack_int* ldq, + lapack_int* iwork, lapack_int* rwork, + lapack_complex_float* tau, lapack_complex_float* work, + lapack_int* info); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) -lapack_int LAPACKE_zggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, - lapack_complex_double* a, lapack_int lda, - lapack_complex_double* b, lapack_int ldb, - double tola, double tolb, lapack_int* k, +lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, + lapack_complex_double* a, lapack_int* lda, + lapack_complex_double* b, lapack_int* ldb, + double* tola, double* tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u, - lapack_int ldu, lapack_complex_double* v, - lapack_int ldv, lapack_complex_double* q, - lapack_int ldq ); + lapack_int* ldu, lapack_complex_double* v, + lapack_int* ldv, lapack_complex_double* q, + lapack_int* ldq, lapack_int* iwork, lapack_int* rwork, + lapack_complex_double* tau, lapack_complex_double* work, + lapack_int* info); #define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3) void LAPACK_cggsvp3(