commit
fc101b67e5
17
Makefile.x86
17
Makefile.x86
|
@ -1,10 +1,21 @@
|
||||||
# COMPILER_PREFIX = mingw32-
|
# COMPILER_PREFIX = mingw32-
|
||||||
|
|
||||||
ifdef HAVE_SSE
|
ifndef DYNAMIC_ARCH
|
||||||
CCOMMON_OPT += -msse
|
ADD_CPUFLAGS = 1
|
||||||
FCOMMON_OPT += -msse
|
else
|
||||||
|
ifdef TARGET_CORE
|
||||||
|
ADD_CPUFLAGS = 1
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef ADD_CPUFLAGS
|
||||||
|
ifdef HAVE_SSE
|
||||||
|
CCOMMON_OPT += -msse
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -msse
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), Interix)
|
ifeq ($(OSNAME), Interix)
|
||||||
ARFLAGS = -m x86
|
ARFLAGS = -m x86
|
||||||
|
|
|
@ -8,6 +8,16 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
ifndef DYNAMIC_ARCH
|
||||||
|
ADD_CPUFLAGS = 1
|
||||||
|
else
|
||||||
|
ifdef TARGET_CORE
|
||||||
|
ADD_CPUFLAGS = 1
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef ADD_CPUFLAGS
|
||||||
ifdef HAVE_SSE3
|
ifdef HAVE_SSE3
|
||||||
CCOMMON_OPT += -msse3
|
CCOMMON_OPT += -msse3
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
@ -44,7 +54,6 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), SKYLAKEX)
|
ifeq ($(CORE), SKYLAKEX)
|
||||||
ifndef DYNAMIC_ARCH
|
|
||||||
ifndef NO_AVX512
|
ifndef NO_AVX512
|
||||||
CCOMMON_OPT += -march=skylake-avx512
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
@ -62,10 +71,8 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(CORE), COOPERLAKE)
|
ifeq ($(CORE), COOPERLAKE)
|
||||||
ifndef DYNAMIC_ARCH
|
|
||||||
ifndef NO_AVX512
|
ifndef NO_AVX512
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
# cooperlake support was added in 10.1
|
# cooperlake support was added in 10.1
|
||||||
|
@ -88,7 +95,6 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
|
||||||
|
|
||||||
ifdef HAVE_AVX2
|
ifdef HAVE_AVX2
|
||||||
ifndef NO_AVX2
|
ifndef NO_AVX2
|
||||||
|
@ -120,6 +126,7 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), Interix)
|
ifeq ($(OSNAME), Interix)
|
||||||
|
|
|
@ -299,6 +299,10 @@ if (NO_AVX2)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (NO_AVX512)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (USE_THREAD)
|
if (USE_THREAD)
|
||||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||||
# NO_AFFINITY = 1
|
# NO_AFFINITY = 1
|
||||||
|
|
|
@ -126,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define get_cpu_ftr(id, var) ({ \
|
#define get_cpu_ftr(id, var) ({ \
|
||||||
__asm__ ("mrs %0, "#id : "=r" (var)); \
|
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
||||||
})
|
})
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
|
|
@ -186,7 +186,7 @@ ZSWAPKERNEL = zswap.c
|
||||||
SGEMVNKERNEL = sgemv_n.c
|
SGEMVNKERNEL = sgemv_n.c
|
||||||
DGEMVNKERNEL = dgemv_n_power10.c
|
DGEMVNKERNEL = dgemv_n_power10.c
|
||||||
CGEMVNKERNEL = cgemv_n.c
|
CGEMVNKERNEL = cgemv_n.c
|
||||||
ZGEMVNKERNEL = zgemv_n_4.c
|
ZGEMVNKERNEL = zgemv_n_power10.c
|
||||||
#
|
#
|
||||||
SGEMVTKERNEL = sgemv_t.c
|
SGEMVTKERNEL = sgemv_t.c
|
||||||
DGEMVTKERNEL = dgemv_t_power10.c
|
DGEMVTKERNEL = dgemv_t_power10.c
|
||||||
|
|
|
@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
__vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
|
__vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
|
||||||
BLASLONG l = 0;
|
BLASLONG l = 0;
|
||||||
vec_t *rowA = (vec_t *) & AO[0];
|
vec_t *rowA = (vec_t *) & AO[0];
|
||||||
vec_t *rb = (vec_t *) & BO[0];
|
|
||||||
__vector_pair rowB, rowB1;
|
__vector_pair rowB, rowB1;
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
rowB = *((__vector_pair *)((void *)&BO[0]));
|
||||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
rowB1 = *((__vector_pair *)((void *)&BO[4]));
|
||||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||||
|
@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
for (l = 1; l < temp; l++)
|
for (l = 1; l < temp; l++)
|
||||||
{
|
{
|
||||||
rowA = (vec_t *) & AO[l << 3];
|
rowA = (vec_t *) & AO[l << 3];
|
||||||
rb = (vec_t *) & BO[l << 3];
|
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
|
||||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
|
||||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||||
|
@ -247,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
BLASLONG l = 0;
|
BLASLONG l = 0;
|
||||||
vec_t *rowA = (vec_t *) & AO[0];
|
vec_t *rowA = (vec_t *) & AO[0];
|
||||||
__vector_pair rowB, rowB1;
|
__vector_pair rowB, rowB1;
|
||||||
vec_t *rb = (vec_t *) & BO[0];
|
rowB = *((__vector_pair *)((void *)&BO[0]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
rowB1 = *((__vector_pair *)((void *)&BO[4]));
|
||||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
|
||||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||||
|
@ -257,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
for (l = 1; l < temp; l++)
|
for (l = 1; l < temp; l++)
|
||||||
{
|
{
|
||||||
rowA = (vec_t *) & AO[l << 2];
|
rowA = (vec_t *) & AO[l << 2];
|
||||||
rb = (vec_t *) & BO[l << 3];
|
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
|
||||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
|
||||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||||
|
@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
BLASLONG l = 0;
|
BLASLONG l = 0;
|
||||||
vec_t *rowA = (vec_t *) & AO[0];
|
vec_t *rowA = (vec_t *) & AO[0];
|
||||||
__vector_pair rowB, rowB1;
|
__vector_pair rowB, rowB1;
|
||||||
vec_t *rb = (vec_t *) & BO[0];
|
rowB = *((__vector_pair *)((void *)&BO[0]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
rowB1 = *((__vector_pair *)((void *)&BO[4]));
|
||||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
|
||||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||||
for (l = 1; l < temp; l++)
|
for (l = 1; l < temp; l++)
|
||||||
{
|
{
|
||||||
rowA = (vec_t *) & AO[l << 1];
|
rowA = (vec_t *) & AO[l << 1];
|
||||||
rb = (vec_t *) & BO[l << 3];
|
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
|
||||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
|
||||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||||
}
|
}
|
||||||
|
@ -403,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
BLASLONG l = 0;
|
BLASLONG l = 0;
|
||||||
vec_t *rowA = (vec_t *) & AO[0];
|
vec_t *rowA = (vec_t *) & AO[0];
|
||||||
__vector_pair rowB;
|
__vector_pair rowB;
|
||||||
vec_t *rb = (vec_t *) & BO[0];
|
rowB = *((__vector_pair *)((void *)&BO[0]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
|
||||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||||
|
@ -412,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
for (l = 1; l < temp; l++)
|
for (l = 1; l < temp; l++)
|
||||||
{
|
{
|
||||||
rowA = (vec_t *) & AO[l << 3];
|
rowA = (vec_t *) & AO[l << 3];
|
||||||
rb = (vec_t *) & BO[l << 2];
|
rowB = *((__vector_pair *)((void *)&BO[l << 2]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
|
||||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||||
|
@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
BLASLONG l = 0;
|
BLASLONG l = 0;
|
||||||
vec_t *rowA = (vec_t *) & AO[0];
|
vec_t *rowA = (vec_t *) & AO[0];
|
||||||
__vector_pair rowB;
|
__vector_pair rowB;
|
||||||
vec_t *rb = (vec_t *) & BO[0];
|
rowB = *((__vector_pair *)((void *)&BO[0]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
|
||||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||||
for (l = 1; l < temp; l++)
|
for (l = 1; l < temp; l++)
|
||||||
{
|
{
|
||||||
rowA = (vec_t *) & AO[l << 2];
|
rowA = (vec_t *) & AO[l << 2];
|
||||||
rb = (vec_t *) & BO[l << 2];
|
rowB = *((__vector_pair *)((void *)&BO[l << 2]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
|
||||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
}
|
}
|
||||||
|
@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
BLASLONG l = 0;
|
BLASLONG l = 0;
|
||||||
vec_t *rowA = (vec_t *) & AO[0];
|
vec_t *rowA = (vec_t *) & AO[0];
|
||||||
__vector_pair rowB;
|
__vector_pair rowB;
|
||||||
vec_t *rb = (vec_t *) & BO[0];
|
rowB = *((__vector_pair *)((void *)&BO[0]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
|
||||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||||
for (l = 1; l < temp; l++)
|
for (l = 1; l < temp; l++)
|
||||||
{
|
{
|
||||||
rowA = (vec_t *) & AO[l << 1];
|
rowA = (vec_t *) & AO[l << 1];
|
||||||
rb = (vec_t *) & BO[l << 2];
|
rowB = *((__vector_pair *)((void *)&BO[l << 2]));
|
||||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
|
||||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
}
|
}
|
||||||
SAVE_ACC (&acc0, 0);
|
SAVE_ACC (&acc0, 0);
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#elif HAVE_KERNEL_4x4_VEC
|
#elif HAVE_KERNEL_4x4_VEC
|
||||||
|
|
||||||
|
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
|
||||||
|
typedef __vector unsigned char vec_t;
|
||||||
|
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||||
|
|
||||||
|
|
||||||
|
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||||
|
BLASLONG i;
|
||||||
|
FLOAT *a0, *a1, *a2, *a3;
|
||||||
|
a0 = ap;
|
||||||
|
a1 = ap + lda;
|
||||||
|
a2 = a1 + lda;
|
||||||
|
a3 = a2 + lda;
|
||||||
|
__vector_quad acc0, acc1, acc2, acc3;;
|
||||||
|
__vector_quad acc4, acc5, acc6, acc7;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_pair *Va0, *Va1, *Va2, *Va3;
|
||||||
|
i = 0;
|
||||||
|
n = n << 1;
|
||||||
|
__builtin_mma_xxsetaccz (&acc0);
|
||||||
|
__builtin_mma_xxsetaccz (&acc1);
|
||||||
|
__builtin_mma_xxsetaccz (&acc2);
|
||||||
|
__builtin_mma_xxsetaccz (&acc3);
|
||||||
|
__builtin_mma_xxsetaccz (&acc4);
|
||||||
|
__builtin_mma_xxsetaccz (&acc5);
|
||||||
|
__builtin_mma_xxsetaccz (&acc6);
|
||||||
|
__builtin_mma_xxsetaccz (&acc7);
|
||||||
|
while (i < n) {
|
||||||
|
|
||||||
|
vec_t *rx = (vec_t *) & x[i];
|
||||||
|
Va0 = ((__vector_pair*)((void*)&a0[i]));
|
||||||
|
Va1 = ((__vector_pair*)((void*)&a1[i]));
|
||||||
|
Va2 = ((__vector_pair*)((void*)&a2[i]));
|
||||||
|
Va3 = ((__vector_pair*)((void*)&a3[i]));
|
||||||
|
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]);
|
||||||
|
i += 8;
|
||||||
|
|
||||||
|
}
|
||||||
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc0);
|
||||||
|
register FLOAT temp_r0 = result[0][0] - result[1][1];
|
||||||
|
register FLOAT temp_i0 = result[0][1] + result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc4);
|
||||||
|
temp_r0 += result[2][0] - result[3][1];
|
||||||
|
temp_i0 += result[2][1] + result[3][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc1);
|
||||||
|
register FLOAT temp_r1 = result[0][0] - result[1][1];
|
||||||
|
register FLOAT temp_i1 = result[0][1] + result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc5);
|
||||||
|
temp_r1 += result[2][0] - result[3][1];
|
||||||
|
temp_i1 += result[2][1] + result[3][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc2);
|
||||||
|
register FLOAT temp_r2 = result[0][0] - result[1][1];
|
||||||
|
register FLOAT temp_i2 = result[0][1] + result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc6);
|
||||||
|
temp_r2 += result[2][0] - result[3][1];
|
||||||
|
temp_i2 += result[2][1] + result[3][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc3);
|
||||||
|
register FLOAT temp_r3 = result[0][0] - result[1][1];
|
||||||
|
register FLOAT temp_i3 = result[0][1] + result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc7);
|
||||||
|
temp_r3 += result[2][0] - result[3][1];
|
||||||
|
temp_i3 += result[2][1] + result[3][0];
|
||||||
|
#else
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc0);
|
||||||
|
register FLOAT temp_r0 = result[0][0] + result[1][1];
|
||||||
|
register FLOAT temp_i0 = result[0][1] - result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc4);
|
||||||
|
temp_r0 += result[2][0] + result[3][1];
|
||||||
|
temp_i0 += result[2][1] - result[3][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc1);
|
||||||
|
register FLOAT temp_r1 = result[0][0] + result[1][1];
|
||||||
|
register FLOAT temp_i1 = result[0][1] - result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc5);
|
||||||
|
temp_r1 += result[2][0] + result[3][1];
|
||||||
|
temp_i1 += result[2][1] - result[3][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc2);
|
||||||
|
register FLOAT temp_r2 = result[0][0] + result[1][1];
|
||||||
|
register FLOAT temp_i2 = result[0][1] - result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc6);
|
||||||
|
temp_r2 += result[2][0] + result[3][1];
|
||||||
|
temp_i2 += result[2][1] - result[3][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc3);
|
||||||
|
register FLOAT temp_r3 = result[0][0] + result[1][1];
|
||||||
|
register FLOAT temp_i3 = result[0][1] - result[1][0];
|
||||||
|
__builtin_mma_disassemble_acc ((void *)result, &acc7);
|
||||||
|
temp_r3 += result[2][0] + result[3][1];
|
||||||
|
temp_i3 += result[2][1] - result[3][0];
|
||||||
|
#endif
|
||||||
|
#if !defined(XCONJ)
|
||||||
|
|
||||||
|
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
|
||||||
|
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
|
||||||
|
y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
|
||||||
|
y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
|
||||||
|
y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
|
||||||
|
y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
|
||||||
|
y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
|
||||||
|
y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
|
||||||
|
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
|
||||||
|
y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
|
||||||
|
y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
|
||||||
|
y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
|
||||||
|
y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
|
||||||
|
y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
|
||||||
|
y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#else
|
||||||
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
FLOAT *a0, *a1, *a2, *a3;
|
FLOAT *a0, *a1, *a2, *a3;
|
||||||
|
@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||||
|
|
|
@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
|
||||||
int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3};
|
int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3};
|
||||||
BLASLONG n_count = n;
|
BLASLONG n_count = n;
|
||||||
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
|
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
|
||||||
|
#if defined(__clang__)
|
||||||
|
for(;n_count>23;n_count-=24) COMPUTE(24)
|
||||||
|
#else
|
||||||
for(;n_count>23;n_count-=24) COMPUTE_n24
|
for(;n_count>23;n_count-=24) COMPUTE_n24
|
||||||
|
#endif
|
||||||
for(;n_count>19;n_count-=20) COMPUTE(20)
|
for(;n_count>19;n_count-=20) COMPUTE(20)
|
||||||
for(;n_count>15;n_count-=16) COMPUTE(16)
|
for(;n_count>15;n_count-=16) COMPUTE(16)
|
||||||
for(;n_count>11;n_count-=12) COMPUTE(12)
|
for(;n_count>11;n_count-=12) COMPUTE(12)
|
||||||
|
|
|
@ -319,14 +319,14 @@
|
||||||
REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL,
|
REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL,
|
||||||
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
||||||
COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
||||||
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
$ CTEMP3, ESHIFT, S, SHIFT, SIGNBC,
|
||||||
$ U12, X, ABI12, Y
|
$ U12, X, ABI12, Y
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
COMPLEX CLADIV
|
COMPLEX CLADIV
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
REAL CLANHS, SLAMCH
|
REAL CLANHS, SLAMCH
|
||||||
EXTERNAL CLADIV, LLSAME, CLANHS, SLAMCH
|
EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH
|
||||||
* ..
|
* ..
|
||||||
* .. External Subroutines ..
|
* .. External Subroutines ..
|
||||||
EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA
|
EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA
|
||||||
|
@ -351,6 +351,7 @@
|
||||||
ILSCHR = .TRUE.
|
ILSCHR = .TRUE.
|
||||||
ISCHUR = 2
|
ISCHUR = 2
|
||||||
ELSE
|
ELSE
|
||||||
|
ILSCHR = .TRUE.
|
||||||
ISCHUR = 0
|
ISCHUR = 0
|
||||||
END IF
|
END IF
|
||||||
*
|
*
|
||||||
|
@ -364,6 +365,7 @@
|
||||||
ILQ = .TRUE.
|
ILQ = .TRUE.
|
||||||
ICOMPQ = 3
|
ICOMPQ = 3
|
||||||
ELSE
|
ELSE
|
||||||
|
ILQ = .TRUE.
|
||||||
ICOMPQ = 0
|
ICOMPQ = 0
|
||||||
END IF
|
END IF
|
||||||
*
|
*
|
||||||
|
@ -377,6 +379,7 @@
|
||||||
ILZ = .TRUE.
|
ILZ = .TRUE.
|
||||||
ICOMPZ = 3
|
ICOMPZ = 3
|
||||||
ELSE
|
ELSE
|
||||||
|
ILZ = .TRUE.
|
||||||
ICOMPZ = 0
|
ICOMPZ = 0
|
||||||
END IF
|
END IF
|
||||||
*
|
*
|
||||||
|
|
|
@ -139,7 +139,7 @@
|
||||||
* =====================================================================
|
* =====================================================================
|
||||||
*
|
*
|
||||||
* .. Parameters ..
|
* .. Parameters ..
|
||||||
DOUBLE PRECISION ZERO, HALF, ONE
|
DOUBLE PRECISION ZERO, HALF, ONE, TWO
|
||||||
PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0,
|
PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0,
|
||||||
$ TWO = 2.0D0 )
|
$ TWO = 2.0D0 )
|
||||||
DOUBLE PRECISION MULTPL
|
DOUBLE PRECISION MULTPL
|
||||||
|
|
|
@ -139,7 +139,7 @@
|
||||||
* =====================================================================
|
* =====================================================================
|
||||||
*
|
*
|
||||||
* .. Parameters ..
|
* .. Parameters ..
|
||||||
REAL ZERO, HALF, ONE
|
REAL ZERO, HALF, ONE, TWO
|
||||||
PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0,
|
PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0,
|
||||||
$ TWO = 2.0E+0 )
|
$ TWO = 2.0E+0 )
|
||||||
REAL MULTPL
|
REAL MULTPL
|
||||||
|
|
|
@ -319,7 +319,7 @@
|
||||||
DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL,
|
DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL,
|
||||||
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
||||||
COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
||||||
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
$ CTEMP3, ESHIFT, S, SHIFT, SIGNBC,
|
||||||
$ U12, X, ABI12, Y
|
$ U12, X, ABI12, Y
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
|
@ -352,6 +352,7 @@
|
||||||
ILSCHR = .TRUE.
|
ILSCHR = .TRUE.
|
||||||
ISCHUR = 2
|
ISCHUR = 2
|
||||||
ELSE
|
ELSE
|
||||||
|
ILSCHR = .TRUE.
|
||||||
ISCHUR = 0
|
ISCHUR = 0
|
||||||
END IF
|
END IF
|
||||||
*
|
*
|
||||||
|
@ -365,6 +366,7 @@
|
||||||
ILQ = .TRUE.
|
ILQ = .TRUE.
|
||||||
ICOMPQ = 3
|
ICOMPQ = 3
|
||||||
ELSE
|
ELSE
|
||||||
|
ILQ = .TRUE.
|
||||||
ICOMPQ = 0
|
ICOMPQ = 0
|
||||||
END IF
|
END IF
|
||||||
*
|
*
|
||||||
|
@ -378,6 +380,7 @@
|
||||||
ILZ = .TRUE.
|
ILZ = .TRUE.
|
||||||
ICOMPZ = 3
|
ICOMPZ = 3
|
||||||
ELSE
|
ELSE
|
||||||
|
ILZ = .TRUE.
|
||||||
ICOMPZ = 0
|
ICOMPZ = 0
|
||||||
END IF
|
END IF
|
||||||
*
|
*
|
||||||
|
|
Loading…
Reference in New Issue