commit b87a77da02

@@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
 OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
 OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
 OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
+PKG_EXTRALIB := $(EXTRALIB)
+ifeq ($(USE_OPENMP), 1)
+ifeq ($(C_COMPILER), PGI)
+PKG_EXTRALIB += -lomp
+else
+PKG_EXTRALIB += -lgomp
+endif
+endif

 .PHONY : install
 .NOTPARALLEL : install

@@ -147,7 +155,7 @@ endif
 @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
 @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
 @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
-@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
+@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
 @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"

@@ -25,6 +25,10 @@ else ifeq ($(ARCH), powerpc)
 override ARCH=power
 else ifeq ($(ARCH), i386)
 override ARCH=x86
+else ifeq ($(ARCH), armv6)
+override ARCH=arm
+else ifeq ($(ARCH), armv7)
+override ARCH=arm
 else ifeq ($(ARCH), aarch64)
 override ARCH=arm64
 else ifeq ($(ARCH), zarch)

@@ -7,5 +7,5 @@ Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: @OPENBLAS_VERSION@
 URL: https://github.com/xianyi/OpenBLAS
-Libs: -L${libdir} -lopenblas${libsuffix}
+Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
 Cflags: -I${includedir}

@@ -54,14 +54,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
 endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
 set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
-set(ARM 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
 if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
 set(ARM64 1)
 else()
 set(ARM 1)
 endif()
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+set(ARM 1)
 elseif (${CMAKE_CROSSCOMPILING})
 if (${TARGET} STREQUAL "CORE2")
 if (NOT BINARY)

@@ -197,6 +197,8 @@ int detect(void)


 }
+#else
+return CPU_ARMV8;
 #endif

 return CPU_UNKNOWN;

@@ -145,7 +145,7 @@ int detect(void){
 if (implementation >= 0x40000u) return CPUTYPE_POWER10;
 else if (implementation & 0x20000) return CPUTYPE_POWER9;
 else if (implementation & 0x10000) return CPUTYPE_POWER8;
-else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7
+else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7
 else if (implementation & 0x04000) return CPUTYPE_POWER6;
 else if (implementation & 0x02000) return CPUTYPE_POWER5;
 else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450

@@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 break;
 }

-#pragma omp parallel for schedule(OMP_SCHED)
+#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
 for (i = 0; i < num; i ++) {

 #ifndef USE_SIMPLE_THREADED_LEVEL3

@@ -87,22 +87,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
 rowC[0] += result[1] * alpha;
 #endif

-#define SET_ACC_ZERO4() \
-__builtin_mma_xxsetaccz (&acc0); \
-__builtin_mma_xxsetaccz (&acc1); \
-__builtin_mma_xxsetaccz (&acc2); \
-__builtin_mma_xxsetaccz (&acc3);
-
-#define SET_ACC_ZERO8() \
-__builtin_mma_xxsetaccz (&acc0); \
-__builtin_mma_xxsetaccz (&acc1); \
-__builtin_mma_xxsetaccz (&acc2); \
-__builtin_mma_xxsetaccz (&acc3); \
-__builtin_mma_xxsetaccz (&acc4); \
-__builtin_mma_xxsetaccz (&acc5); \
-__builtin_mma_xxsetaccz (&acc6); \
-__builtin_mma_xxsetaccz (&acc7);
-
 #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");

 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))

@@ -210,12 +194,22 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 PREFETCH1 (CO + ldc + ldc, 128);
 PREFETCH1 (CO + ldc + ldc + ldc, 128);
 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-SET_ACC_ZERO8 ();
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & BO[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
+__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
+__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
+__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 4];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 4];
+rb = (vec_t *) & BO[l << 2];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);

@@ -254,13 +248,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1, acc2, acc3;
-SET_ACC_ZERO4 ();
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & BO[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 3];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 3];
+rb = (vec_t *) & BO[l << 2];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);

@@ -291,14 +291,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1;
-__builtin_mma_xxsetaccz (&acc0);
-__builtin_mma_xxsetaccz (&acc1);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & BO[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 2];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 2];
+rb = (vec_t *) & BO[l << 2];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);

@@ -325,13 +328,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0;
-__builtin_mma_xxsetaccz (&acc0);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & BO[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 1];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 1];
+rb = (vec_t *) & BO[l << 2];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 }

@@ -414,16 +420,27 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-SET_ACC_ZERO8 ();
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0, 0, 0, 0 };
+t[0] = BO[0], t[1] = BO[1];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & t[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+vec_t *rowA = (vec_t *) & AO[0];
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
+__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
+__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
+__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0, 0, 0, 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & t[0];
+rb = (vec_t *) & t[0];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-vec_t *rowA = (vec_t *) & AO[l << 4];
+rowA = (vec_t *) & AO[l << 4];
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);

@@ -461,16 +478,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1, acc2, acc3;
-SET_ACC_ZERO4 ();
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0, 0, 0, 0 };
+t[0] = BO[0], t[1] = BO[1];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & t[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+vec_t *rowA = (vec_t *) & AO[0];
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0, 0, 0, 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & t[0];
+rb = (vec_t *) & t[0];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-vec_t *rowA = (vec_t *) & AO[l << 3];
+rowA = (vec_t *) & AO[l << 3];
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);

@@ -500,17 +524,21 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1;
-__builtin_mma_xxsetaccz (&acc0);
-__builtin_mma_xxsetaccz (&acc1);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0, 0, 0, 0 };
+t[0] = BO[0], t[1] = BO[1];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & t[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+vec_t *rowA = (vec_t *) & AO[0];
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0, 0, 0, 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & t[0];
+rb = (vec_t *) & t[0];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-vec_t *rowA = (vec_t *) & AO[l << 2];
+rowA = (vec_t *) & AO[l << 2];
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
 }

@@ -536,16 +564,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0;
-__builtin_mma_xxsetaccz (&acc0);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0, 0, 0, 0 };
+t[0] = BO[0], t[1] = BO[1];
+__vector_pair rowB;
+vec_t *rb = (vec_t *) & t[0];
+__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+vec_t *rowA = (vec_t *) & AO[0];
+__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0, 0, 0, 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-__vector_pair rowB;
-vec_t *rb = (vec_t *) & t[0];
+rb = (vec_t *) & t[0];
 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-vec_t *rowA = (vec_t *) & AO[l << 1];
+rowA = (vec_t *) & AO[l << 1];
 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
 }
 SAVE2x4_ACC (&acc0, 0);

@@ -134,21 +134,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
 __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \
 __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \
 __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]);
-#define SET_ACC_ZERO4() \
-__builtin_mma_xxsetaccz (&acc0); \
-__builtin_mma_xxsetaccz (&acc1); \
-__builtin_mma_xxsetaccz (&acc2); \
-__builtin_mma_xxsetaccz (&acc3);
-
-#define SET_ACC_ZERO8() \
-__builtin_mma_xxsetaccz (&acc0); \
-__builtin_mma_xxsetaccz (&acc1); \
-__builtin_mma_xxsetaccz (&acc2); \
-__builtin_mma_xxsetaccz (&acc3); \
-__builtin_mma_xxsetaccz (&acc4); \
-__builtin_mma_xxsetaccz (&acc5); \
-__builtin_mma_xxsetaccz (&acc6); \
-__builtin_mma_xxsetaccz (&acc7);

 #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");


@@ -249,8 +234,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-SET_ACC_ZERO8 ();
 BLASLONG l = 0;
+vec_t *rowA1 = (vec_t *) & AO[0];
+vec_t *rowB1 = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]);
+__builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]);
+__builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]);
+__builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]);
+__builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]);
+__builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]);
+__builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]);
+__builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]);
+AO += 16;
+BO += 8;
+temp--;
 BLASLONG K = temp / 64;
 for (l = 0; l < K; l++)
 {

@@ -454,12 +451,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1, acc2, acc3;
-SET_ACC_ZERO4 ();
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
+__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]);
+__builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 3];
-vec_t *rowB = (vec_t *) & BO[l << 3];
+rowA = (vec_t *) & AO[l << 3];
+rowB = (vec_t *) & BO[l << 3];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);

@@ -489,13 +491,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1;
-__builtin_mma_xxsetaccz (&acc0);
-__builtin_mma_xxsetaccz (&acc1);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 2];
-vec_t *rowB = (vec_t *) & BO[l << 3];
+rowA = (vec_t *) & AO[l << 2];
+rowB = (vec_t *) & BO[l << 3];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
 }

@@ -522,15 +526,18 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v2sf_t *rowC;
 v2sf_t result[8];
 __vector_quad acc0, acc1;
-__builtin_mma_xxsetaccz (&acc0);
-__builtin_mma_xxsetaccz (&acc1);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0 };
+t[0] = AO[0], t[1] = AO[1];
+vec_t *rowA = (vec_t *) & t[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0 };
 t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
-vec_t *rowA = (vec_t *) & t[0];
-vec_t *rowB = (vec_t *) & BO[l << 3];
+rowA = (vec_t *) & t[0];
+rowB = (vec_t *) & BO[l << 3];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
 }

@@ -625,13 +632,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 FLOAT *A1;
 A1 = AO + (16 * k);
 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-SET_ACC_ZERO8 ();
 BLASLONG l = 0;
-for (l = 0; l < k; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+vec_t *rowA1 = (vec_t *) & A1[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+__builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
+__builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
+__builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
+__builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
+for (l = 1; l < k; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 4];
-vec_t *rowA1 = (vec_t *) & A1[l << 4];
-vec_t *rowB = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 4];
+rowA1 = (vec_t *) & A1[l << 4];
+rowB = (vec_t *) & BO[l << 2];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);

@@ -673,12 +690,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1, acc2, acc3;
-SET_ACC_ZERO4 ();
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 4];
-vec_t *rowB = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 4];
+rowB = (vec_t *) & BO[l << 2];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);

@@ -710,13 +732,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1;
-__builtin_mma_xxsetaccz (&acc0);
-__builtin_mma_xxsetaccz (&acc1);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 3];
-vec_t *rowB = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 3];
+rowB = (vec_t *) & BO[l << 2];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
 }

@@ -742,12 +766,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 __vector_quad acc0;
 v4sf_t result[4];
-__builtin_mma_xxsetaccz (&acc0);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+vec_t *rowA = (vec_t *) & AO[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+for (l = 1; l < temp; l++)
 {
-vec_t *rowA = (vec_t *) & AO[l << 2];
-vec_t *rowB = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & AO[l << 2];
+rowB = (vec_t *) & BO[l << 2];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 }
 SAVE_ACC (&acc0, 0);

@@ -771,14 +797,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v2sf_t *rowC;
 v2sf_t result[8];
 __vector_quad acc0;
-__builtin_mma_xxsetaccz (&acc0);
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0 };
+t[0] = AO[0], t[1] = AO[1];
+vec_t *rowA = (vec_t *) & t[0];
+vec_t *rowB = (vec_t *) & BO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0 };
 t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
-vec_t *rowA = (vec_t *) & t[0];
-vec_t *rowB = (vec_t *) & BO[l << 2];
+rowA = (vec_t *) & t[0];
+rowB = (vec_t *) & BO[l << 2];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 }
 SAVE4x2_ACC (&acc0, 0);

@@ -856,15 +885,26 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 FLOAT *A1;
 A1 = AO + (16 * k);
 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-SET_ACC_ZERO8 ();
 BLASLONG l = 0;
-for (l = 0; l < k; l++)
+FLOAT t[4] = { 0 };
+t[0] = BO[0], t[1] = BO[1];
+vec_t *rowB = (vec_t *) & t[0];
+vec_t *rowA = (vec_t *) & AO[0];
+vec_t *rowA1 = (vec_t *) & A1[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+__builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
+__builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
+__builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
+__builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
+for (l = 1; l < k; l++)
 {
-FLOAT t[4] = { 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-vec_t *rowB = (vec_t *) & t[0];
-vec_t *rowA = (vec_t *) & AO[l << 4];
-vec_t *rowA1 = (vec_t *) & A1[l << 4];
+rowB = (vec_t *) & t[0];
+rowA = (vec_t *) & AO[l << 4];
+rowA1 = (vec_t *) & A1[l << 4];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);

@@ -897,7 +937,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1, acc2, acc3;
-SET_ACC_ZERO4 ();
 BLASLONG l = 0;
 #if defined(TRMMKERNEL)
 REFRESH_POINTERS (16, 2)

@@ -905,12 +944,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 BO = B;
 temp = k;
 #endif
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0 };
+t[0] = BO[0], t[1] = BO[1];
+vec_t *rowB = (vec_t *) & t[0];
+vec_t *rowA = (vec_t *) & AO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-vec_t *rowB = (vec_t *) & t[0];
-vec_t *rowA = (vec_t *) & AO[l << 4];
+rowB = (vec_t *) & t[0];
+rowA = (vec_t *) & AO[l << 4];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);

@@ -934,8 +980,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0, acc1;
-__builtin_mma_xxsetaccz (&acc0);
-__builtin_mma_xxsetaccz (&acc1);
 #if defined(TRMMKERNEL)
 REFRESH_POINTERS (8, 2)
 #else

@@ -943,12 +987,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 temp = k;
 #endif
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0 };
+t[0] = BO[0], t[1] = BO[1];
+vec_t *rowB = (vec_t *) & t[0];
+vec_t *rowA = (vec_t *) & AO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-vec_t *rowB = (vec_t *) & t[0];
-vec_t *rowA = (vec_t *) & AO[l << 3];
+rowB = (vec_t *) & t[0];
+rowA = (vec_t *) & AO[l << 3];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
 }

@@ -968,7 +1017,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 v4sf_t *rowC;
 v4sf_t result[4];
 __vector_quad acc0;
-__builtin_mma_xxsetaccz (&acc0);
 #if defined(TRMMKERNEL)
 REFRESH_POINTERS (4, 2)
 #else

@@ -976,12 +1024,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 temp = k;
 #endif
 BLASLONG l = 0;
-for (l = 0; l < temp; l++)
+FLOAT t[4] = { 0 };
+t[0] = BO[0], t[1] = BO[1];
+vec_t *rowB = (vec_t *) & t[0];
+vec_t *rowA = (vec_t *) & AO[0];
+__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+for (l = 1; l < temp; l++)
 {
-FLOAT t[4] = { 0 };
 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-vec_t *rowB = (vec_t *) & t[0];
-vec_t *rowA = (vec_t *) & AO[l << 2];
+rowB = (vec_t *) & t[0];
+rowA = (vec_t *) & AO[l << 2];
 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
 }
 SAVE2x4_ACC (&acc0, 0);

@@ -168,7 +168,7 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO

 #if defined(SMP)
 static int zdot_thread_function(BLASLONG n, BLASLONG dummy0,
-BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+BLASLONG dummy1, FLOAT dummy2r, FLOAT dummy2i, FLOAT *x, BLASLONG inc_x, FLOAT *y,
 BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
 {
 zdot_compute(n, x, inc_x, y, inc_y, (void *)result);

@@ -3650,45 +3650,45 @@ void LAPACK_zggrqf(
 lapack_int* info );

 #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
-lapack_int LAPACKE_sggsvd( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int n, lapack_int p,
+lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* n, lapack_int* p,
 lapack_int* k, lapack_int* l, float* a,
-lapack_int lda, float* b, lapack_int ldb,
-float* alpha, float* beta, float* u, lapack_int ldu,
-float* v, lapack_int ldv, float* q, lapack_int ldq,
-lapack_int* iwork );
+lapack_int* lda, float* b, lapack_int* ldb,
+float* alpha, float* beta, float* u, lapack_int* ldu,
+float* v, lapack_int* ldv, float* q, lapack_int* ldq,
+float* work, lapack_int* iwork, lapack_int* info );

 #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
-lapack_int LAPACKE_dggsvd( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int n, lapack_int p,
+lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* n, lapack_int* p,
 lapack_int* k, lapack_int* l, double* a,
-lapack_int lda, double* b, lapack_int ldb,
+lapack_int* lda, double* b, lapack_int* ldb,
 double* alpha, double* beta, double* u,
-lapack_int ldu, double* v, lapack_int ldv, double* q,
-lapack_int ldq, lapack_int* iwork );
+lapack_int* ldu, double* v, lapack_int* ldv, double* q,
+lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info );

 #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
-lapack_int LAPACKE_cggsvd( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int n, lapack_int p,
+lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* n, lapack_int* p,
 lapack_int* k, lapack_int* l,
-lapack_complex_float* a, lapack_int lda,
-lapack_complex_float* b, lapack_int ldb,
+lapack_complex_float* a, lapack_int* lda,
+lapack_complex_float* b, lapack_int* ldb,
 float* alpha, float* beta, lapack_complex_float* u,
-lapack_int ldu, lapack_complex_float* v,
-lapack_int ldv, lapack_complex_float* q,
-lapack_int ldq, lapack_int* iwork );
+lapack_int* ldu, lapack_complex_float* v,
+lapack_int* ldv, lapack_complex_float* q,
+lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info );

 #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
-lapack_int LAPACKE_zggsvd( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int n, lapack_int p,
+lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* n, lapack_int* p,
 lapack_int* k, lapack_int* l,
-lapack_complex_double* a, lapack_int lda,
-lapack_complex_double* b, lapack_int ldb,
+lapack_complex_double* a, lapack_int* lda,
+lapack_complex_double* b, lapack_int* ldb,
 double* alpha, double* beta,
-lapack_complex_double* u, lapack_int ldu,
-lapack_complex_double* v, lapack_int ldv,
-lapack_complex_double* q, lapack_int ldq,
-lapack_int* iwork );
+lapack_complex_double* u, lapack_int* ldu,
+lapack_complex_double* v, lapack_int* ldv,
+lapack_complex_double* q, lapack_int* ldq,
+float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info );

 #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3)
 void LAPACK_cggsvd3(

@@ -3753,41 +3753,49 @@ void LAPACK_zggsvd3(
 lapack_int* info );

 #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
-lapack_int LAPACKE_sggsvp( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int p, lapack_int n, float* a,
-lapack_int lda, float* b, lapack_int ldb, float tola,
-float tolb, lapack_int* k, lapack_int* l, float* u,
-lapack_int ldu, float* v, lapack_int ldv, float* q,
-lapack_int ldq );
+lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* p, lapack_int* n, float* a,
+lapack_int* lda, float* b, lapack_int* ldb, float* tola,
+float* tolb, lapack_int* k, lapack_int* l, float* u,
+lapack_int* ldu, float* v, lapack_int* ldv, float* q,
+lapack_int* ldq, lapack_int* iwork, float* tau,
+float* work, lapack_int* info);

 #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
-lapack_int LAPACKE_dggsvp( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int p, lapack_int n, double* a,
-lapack_int lda, double* b, lapack_int ldb,
-double tola, double tolb, lapack_int* k,
-lapack_int* l, double* u, lapack_int ldu, double* v,
-lapack_int ldv, double* q, lapack_int ldq );
+lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* p, lapack_int* n, double* a,
+lapack_int* lda, double* b, lapack_int* ldb,
+double* tola, double* tolb, lapack_int* k,
+lapack_int* l, double* u, lapack_int* ldu, double* v,
+lapack_int* ldv, double* q, lapack_int* ldq,
+lapack_int* iwork, double* tau, double* work,
+lapack_int* info);

 #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
-lapack_int LAPACKE_cggsvp( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int p, lapack_int n,
-lapack_complex_float* a, lapack_int lda,
-lapack_complex_float* b, lapack_int ldb, float tola,
-float tolb, lapack_int* k, lapack_int* l,
-lapack_complex_float* u, lapack_int ldu,
-lapack_complex_float* v, lapack_int ldv,
-lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* p, lapack_int* n,
+lapack_complex_float* a, lapack_int* lda,
+lapack_complex_float* b, lapack_int* ldb, float* tola,
+float* tolb, lapack_int* k, lapack_int* l,
+lapack_complex_float* u, lapack_int* ldu,
+lapack_complex_float* v, lapack_int* ldv,
+lapack_complex_float* q, lapack_int* ldq,
+lapack_int* iwork, lapack_int* rwork,
+lapack_complex_float* tau, lapack_complex_float* work,
+lapack_int* info);

 #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
-lapack_int LAPACKE_zggsvp( int matrix_layout, char jobu, char jobv, char jobq,
-lapack_int m, lapack_int p, lapack_int n,
-lapack_complex_double* a, lapack_int lda,
-lapack_complex_double* b, lapack_int ldb,
-double tola, double tolb, lapack_int* k,
+lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq,
+lapack_int* m, lapack_int* p, lapack_int* n,
+lapack_complex_double* a, lapack_int* lda,
+lapack_complex_double* b, lapack_int* ldb,
+double* tola, double* tolb, lapack_int* k,
 lapack_int* l, lapack_complex_double* u,
-lapack_int ldu, lapack_complex_double* v,
-lapack_int ldv, lapack_complex_double* q,
-lapack_int ldq );
+lapack_int* ldu, lapack_complex_double* v,
+lapack_int* ldv, lapack_complex_double* q,
+lapack_int* ldq, lapack_int* iwork, lapack_int* rwork,
+lapack_complex_double* tau, lapack_complex_double* work,
+lapack_int* info);

 #define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3)
 void LAPACK_cggsvp3(